In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings

from sklearn.linear_model import LinearRegression # explicit class import from module
from sklearn.linear_model import LogisticRegression # explicit class import from module
from sklearn.dummy import DummyRegressor
from sklearn.neighbors import KNeighborsClassifier #we know where this object comes from
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.model_selection import cross_validate
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score, f1_score, recall_score
from sklearn.ensemble import RandomForestClassifier

from imblearn.over_sampling import SMOTE

from sklearn.metrics import classification_report


warnings.filterwarnings('ignore') # Code for stopping warnings (deprecation warning, etc.)
pd.set_option('display.max_columns', None) # Code for showing all columns in the dateset, withoud '...' in between.

In [2]:
people = pd.read_csv("../raw_data/people_train.csv")

In [3]:
X = people.drop(columns = 'attrition')
y = people['attrition']

In [4]:
X_train, X_val, y_train, y_val = train_test_split(X,y, random_state = 0)

In [5]:
sm = SMOTE()
X_train_smote, y_train_smote = sm.fit_sample(X_train, y_train)

In [6]:
from collections import Counter
print('before SMOTE :' , Counter(y_train))
print('after SMOTE :' , Counter(y_train_smote))

before SMOTE : Counter({0: 646, 1: 125})
after SMOTE : Counter({0: 646, 1: 646})


# KNeighbors Classifier

In [7]:
model = KNeighborsClassifier()
model.fit(X_train, y_train)
y_pred = model.predict(X_val)
print(classification_report(y_pred, y_val))

              precision    recall  f1-score   support

           0       0.97      0.84      0.90       251
           1       0.02      0.14      0.04         7

    accuracy                           0.82       258
   macro avg       0.50      0.49      0.47       258
weighted avg       0.95      0.82      0.88       258



In [8]:
model = KNeighborsClassifier()
model.fit(X_train_smote, y_train_smote)
y_pred = model.predict(X_val)
print('accuracy: ',accuracy_score(y_val, y_pred))
print('f1: ',f1_score(y_val, y_pred))
print('recall: ',recall_score(y_val, y_pred))

accuracy:  0.5581395348837209
f1:  0.2692307692307692
recall:  0.5


In [9]:
model = KNeighborsClassifier()
sacc = cross_val_score(model, X_train, y_train, scoring='accuracy')
sf1 = cross_val_score(model, X_train, y_train, scoring='f1')
sre = cross_val_score(model, X_train, y_train, scoring='recall')
print('accuracy: ',sacc.mean())
print('f1: ',sf1.mean())
print('recall: ',sre.mean())

accuracy:  0.813229995810641
f1:  0.014814814814814814
recall:  0.008


In [10]:
model = KNeighborsClassifier()
sacc = cross_val_score(model, X_train_smote, y_train_smote, scoring='accuracy')
sf1 = cross_val_score(model, X_train_smote, y_train_smote, scoring='f1')
sre = cross_val_score(model, X_train_smote, y_train_smote, scoring='recall')
print('accuracy: ',sacc.mean())
print('f1: ',sf1.mean())
print('recall: ',sre.mean())

accuracy:  0.6640806919876686
f1:  0.6995765148211393
recall:  0.7832677400119261


# Logistic Regression


In [11]:
model = LogisticRegression()
sacc = cross_val_score(model, X, y, scoring='accuracy')
sf1 = cross_val_score(model, X, y, scoring='f1')
sre = cross_val_score(model, X, y, scoring='recall')
print('accuracy: ',sacc.mean())
print('f1: ',sf1.mean())
print('recall: ',sre.mean())

accuracy:  0.8775467677006867
f1:  0.4882468987595039
recall:  0.3593582887700535


In [12]:
model = LogisticRegression(class_weight='balanced')
sacc = cross_val_score(model, X, y, scoring='accuracy')
sf1 = cross_val_score(model, X, y, scoring='f1')
sre = cross_val_score(model, X, y, scoring='recall')
print('accuracy: ',sacc.mean())
print('f1: ',sf1.mean())
print('recall: ',sre.mean())

accuracy:  0.7599431683637224
f1:  0.5020598789428383
recall:  0.7306595365418895


In [13]:
model = LogisticRegression()
sacc = cross_val_score(model, X_train_smote, y_train_smote, scoring='accuracy')
sf1 = cross_val_score(model, X_train_smote, y_train_smote, scoring='f1')
sre = cross_val_score(model, X_train_smote, y_train_smote, scoring='recall')
print('accuracy: ',sacc.mean())
print('f1: ',sf1.mean())
print('recall: ',sre.mean())

accuracy:  0.8646763042111878
f1:  0.8342099000468697
recall:  0.8294812164579607


In [14]:
model = LogisticRegression(class_weight='balanced')
sacc = cross_val_score(model, X_train_smote, y_train_smote, scoring='accuracy')
sf1 = cross_val_score(model, X_train_smote, y_train_smote, scoring='f1')
sre = cross_val_score(model, X_train_smote, y_train_smote, scoring='recall')
print('accuracy: ',sacc.mean())
print('f1: ',sf1.mean())
print('recall: ',sre.mean())

accuracy:  0.8615875011223848
f1:  0.8279585192058436
recall:  0.8217531305903399


In [15]:

model = LogisticRegression()
model.fit(X_train,y_train)
model.score(X_val,y_val)

0.8798449612403101

In [16]:
model = LogisticRegression(class_weight='balanced')
model.fit(X_train,y_train)
model.score(X_val,y_val)

0.7945736434108527

In [17]:
model = LogisticRegression()
model.fit(X_train_smote,y_train_smote)
model.score(X_val,y_val)

0.8449612403100775

In [18]:
#análise exploratória e model tunning

# Random Forest Generator

In [19]:
model = RandomForestClassifier(n_estimators=100)
sacc = cross_val_score(model, X_train, y_train, scoring='accuracy')
sf1 = cross_val_score(model, X_train, y_train, scoring='f1')
sre = cross_val_score(model, X_train, y_train, scoring='recall')
print('accuracy: ',sacc.mean())
print('f1: ',sf1.mean())
print('recall: ',sre.mean())

accuracy:  0.8534310850439883
f1:  0.19300310162379128
recall:  0.11200000000000002


In [20]:
model = RandomForestClassifier(n_estimators=100)
sacc = cross_val_score(model, X_train_smote, y_train_smote, scoring='accuracy')
sf1 = cross_val_score(model, X_train_smote, y_train_smote, scoring='f1')
sre = cross_val_score(model, X_train_smote, y_train_smote, scoring='recall')
print('accuracy: ',sacc.mean())
print('f1: ',sf1.mean())
print('recall: ',sre.mean())


accuracy:  0.90340606387118
f1:  0.8763531150276321
recall:  0.848062015503876


In [21]:
model = RandomForestClassifier(n_estimators=100)
model.fit(X_train, y_train)
y_pred = model.predict(X_val)
model.score(X_val,y_val)

0.8604651162790697

In [22]:
model = RandomForestClassifier(n_estimators=100)
model.fit(X_train_smote, y_train_smote)
y_pred = model.predict(X_val)
model.score(X_val,y_val)


0.8527131782945736