In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split as tts
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix
from sklearn.preprocessing import LabelEncoder 


In [3]:
#Read train and test data from csv into respective dataframes
train_data= pd.read_csv('train.csv')
X_train =train_data.drop(["Attrition"],axis = 1)
Y_train = train_data[['Attrition']]
print("****************Y_train*****************")
print(Y_train.columns)
X_test= pd.read_csv('test.csv')
print("*********************X_test******************")
print(X_test.columns)
X_train =X_train.drop(["Id"],axis = 1)
X_train =X_train.drop(["EmployeeNumber"],axis = 1)
X_train =X_train.drop(["Behaviour"],axis = 1)
print("***********X_train*************")
print(X_train.columns)

****************Y_train*****************
Index(['Attrition'], dtype='object')
*********************X_test******************
Index(['Id', 'Age', 'BusinessTravel', 'Department', 'DistanceFromHome',
       'Education', 'EducationField', 'EmployeeNumber',
       'EnvironmentSatisfaction', 'Gender', 'JobInvolvement', 'JobRole',
       'JobSatisfaction', 'MaritalStatus', 'MonthlyIncome',
       'NumCompaniesWorked', 'OverTime', 'PercentSalaryHike',
       'PerformanceRating', 'StockOptionLevel', 'TotalWorkingYears',
       'TrainingTimesLastYear', 'YearsAtCompany', 'YearsInCurrentRole',
       'YearsSinceLastPromotion', 'YearsWithCurrManager', 'CommunicationSkill',
       'Behaviour'],
      dtype='object')
***********X_train*************
Index(['Age', 'BusinessTravel', 'Department', 'DistanceFromHome', 'Education',
       'EducationField', 'EnvironmentSatisfaction', 'Gender', 'JobInvolvement',
       'JobRole', 'JobSatisfaction', 'MaritalStatus', 'MonthlyIncome',
       'NumCompaniesWorked'

In [4]:
#Do One Hot Encoding on categorical columns
X_train = pd.get_dummies(X_train, columns=['BusinessTravel','Department','EducationField','Gender','JobRole','MaritalStatus','OverTime'])
print(X_train.head)


<bound method NDFrame.head of       Age  DistanceFromHome  Education  EnvironmentSatisfaction  \
0      30                 2          3                        3   
1      36                12          4                        3   
2      55                 2          1                        3   
3      39                24          1                        1   
4      37                 3          3                        3   
...   ...               ...        ...                      ...   
1623   42                19          3                        3   
1624   55                 2          1                        3   
1625   25                 9          2                        1   
1626   29                13          3                        1   
1627   29                18          1                        3   

      JobInvolvement  JobSatisfaction  MonthlyIncome  NumCompaniesWorked  \
0                  3                4           2564                   0   
1            

In [5]:
#Fit Logistic Regression model on train data
from sklearn.metrics import accuracy_score
Logreg = LogisticRegression(penalty='l2',C=0.5,random_state=1,solver='lbfgs',max_iter=3000)
Logreg.fit(X_train,Y_train)
Y_pred_train=Logreg.predict(X_train)
print(len(Y_pred_train))
print(Y_pred_train)
Y_pred_prob_train=Logreg.predict_proba(X_train)
Y_pred_prob_train_attrition=Logreg.predict_proba(X_train)[:,1]
Y_pred_prob_train_no_attrition=Logreg.predict_proba(X_train)[:,0]

print("Y_pred_prob_train")
print(len(Y_pred_prob_train))
print(Y_pred_prob_train)
print("Y_pred_prob_train_attriton")
print(len(Y_pred_prob_train_attrition))
print(Y_pred_prob_train_attrition)
print("Y_pred_prob_train_no_attriton")
print(len(Y_pred_prob_train_no_attrition))
print(Y_pred_prob_train_no_attrition)

tn_train, fp_train, fn_train, tp_train = confusion_matrix(Y_train, Y_pred_train).ravel()
TPR_train= tp_train/(tp_train + fn_train)
print("TPR_train is " + str(TPR_train) )
FPR_train=fp_train/(fp_train + tn_train)
print("FPR_train is " + str(FPR_train))
sensitivity_train= tp_train/(tp_train + fn_train)
print("Sensitivity_train is " + str(sensitivity_train))
accuracy_train= (tp_train + tn_train) /(tp_train + tn_train + fp_train + fn_train)
print("ACCURACY_train =" + str(accuracy_score(Y_train,Y_pred_train)))
specificitY_train=1- FPR_train
print("specificitY_train =" +str(specificitY_train))

  y = column_or_1d(y, warn=True)


1628
[0 1 0 ... 1 1 1]
Y_pred_prob_train
1628
[[0.87067062 0.12932938]
 [0.46339447 0.53660553]
 [0.56332291 0.43667709]
 ...
 [0.09849432 0.90150568]
 [0.06197919 0.93802081]
 [0.28134581 0.71865419]]
Y_pred_prob_train_attriton
1628
[0.12932938 0.53660553 0.43667709 ... 0.90150568 0.93802081 0.71865419]
Y_pred_prob_train_no_attriton
1628
[0.87067062 0.46339447 0.56332291 ... 0.09849432 0.06197919 0.28134581]
TPR_train is 0.7643312101910829
FPR_train is 0.2040332147093713
Sensitivity_train is 0.7643312101910829
ACCURACY_train =0.7807125307125307
specificitY_train =0.7959667852906287


In [6]:
#Process Test Data
print(X_test.columns)
X_test =X_test.drop(["Id"],axis = 1)
X_test=X_test.drop(["EmployeeNumber"],axis = 1)
X_test =X_test.drop(["Behaviour"],axis = 1)
print("***********X_test*************")
print(X_test.columns)

Index(['Id', 'Age', 'BusinessTravel', 'Department', 'DistanceFromHome',
       'Education', 'EducationField', 'EmployeeNumber',
       'EnvironmentSatisfaction', 'Gender', 'JobInvolvement', 'JobRole',
       'JobSatisfaction', 'MaritalStatus', 'MonthlyIncome',
       'NumCompaniesWorked', 'OverTime', 'PercentSalaryHike',
       'PerformanceRating', 'StockOptionLevel', 'TotalWorkingYears',
       'TrainingTimesLastYear', 'YearsAtCompany', 'YearsInCurrentRole',
       'YearsSinceLastPromotion', 'YearsWithCurrManager', 'CommunicationSkill',
       'Behaviour'],
      dtype='object')
***********X_test*************
Index(['Age', 'BusinessTravel', 'Department', 'DistanceFromHome', 'Education',
       'EducationField', 'EnvironmentSatisfaction', 'Gender', 'JobInvolvement',
       'JobRole', 'JobSatisfaction', 'MaritalStatus', 'MonthlyIncome',
       'NumCompaniesWorked', 'OverTime', 'PercentSalaryHike',
       'PerformanceRating', 'StockOptionLevel', 'TotalWorkingYears',
       'TrainingTimes

In [7]:
#Do One Hot Encoding on Test categorical columns
print(X_test.columns)
X_test = pd.get_dummies(X_test, columns=['BusinessTravel','Department','EducationField','Gender','JobRole','MaritalStatus','OverTime'])
print(X_test.head)
print(X_test.columns)


Index(['Age', 'BusinessTravel', 'Department', 'DistanceFromHome', 'Education',
       'EducationField', 'EnvironmentSatisfaction', 'Gender', 'JobInvolvement',
       'JobRole', 'JobSatisfaction', 'MaritalStatus', 'MonthlyIncome',
       'NumCompaniesWorked', 'OverTime', 'PercentSalaryHike',
       'PerformanceRating', 'StockOptionLevel', 'TotalWorkingYears',
       'TrainingTimesLastYear', 'YearsAtCompany', 'YearsInCurrentRole',
       'YearsSinceLastPromotion', 'YearsWithCurrManager',
       'CommunicationSkill'],
      dtype='object')
<bound method NDFrame.head of      Age  DistanceFromHome  Education  EnvironmentSatisfaction  \
0     28                 9          3                        4   
1     31                 6          4                        1   
2     37                 6          3                        3   
3     42                 1          2                        4   
4     45                 4          2                        3   
..   ...               ...     

In [8]:
Y_pred_test=Logreg.predict(X_test)
print("Y_pred_test")
print(len(Y_pred_test))
print(Y_pred_test)
Y_pred_prob_test=Logreg.predict_proba(X_test)
Y_pred_prob_test_attrition=Logreg.predict_proba(X_test)[:,1]
Y_pred_prob_test_no_attrition=Logreg.predict_proba(X_test)[:,0]
print("Y_pred_prob_test")
print(len(Y_pred_prob_test))
print(Y_pred_prob_test)
print("Y_pred_prob_test_attriton")
print(len(Y_pred_prob_test_attrition))
print(Y_pred_prob_test_attrition)
print("Y_pred_prob_test_no_attriton")
print(len(Y_pred_prob_test_no_attrition))
print(Y_pred_prob_test_no_attrition)

Y_pred_test
470
[0 0 0 0 0 0 0 1 1 0 0 0 0 0 0 0 1 0 0 0 0 1 0 0 0 0 0 0 1 0 0 0 0 1 0 0 0
 0 1 1 1 1 0 0 0 0 0 0 0 1 1 0 1 1 1 1 0 1 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0
 0 1 0 0 0 0 0 1 0 0 0 0 0 0 0 0 1 0 0 0 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 1 0 0 1 1 0 0 1 1 0 1 0 0 1 0 0 1 0 1 0 0 0 0 0 0 1 0 1
 0 1 1 1 0 1 0 0 0 0 0 0 1 0 0 0 0 1 0 1 0 0 0 0 0 1 0 0 0 1 0 1 0 1 1 0 1
 0 0 0 0 0 0 0 0 0 1 1 0 0 0 0 0 1 1 0 0 0 1 0 0 1 1 0 0 0 1 1 1 0 0 0 0 0
 1 0 1 0 1 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 1 1 0 0 1 0 0 0 0 0 0 0 0 1 0 0 0
 1 0 0 1 1 1 0 0 0 1 0 0 1 1 0 1 0 0 0 0 1 0 1 0 0 0 1 0 1 0 0 1 0 0 0 0 0
 0 0 1 0 0 0 0 1 0 0 0 1 0 0 0 1 1 1 1 1 1 0 0 0 0 0 0 0 1 0 1 1 1 0 0 1 0
 0 0 0 0 0 1 1 1 1 0 1 0 0 0 0 0 0 0 0 1 0 0 1 1 0 0 0 0 0 0 0 1 0 0 1 0 1
 0 0 0 1 1 0 1 1 1 0 1 0 0 0 1 1 1 1 1 1 0 0 1 0 0 0 1 0 0 0 0 0 0 1 0 0 0
 1 0 0 0 0 0 0 1 0 1 1 0 0 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0
 0 0 0 1 0 0 0 0 1 1 0 0 1 1 1 0 0 0 1 1 1 1 1 0 1 0]
Y_pred_prob_test
470
[[0.87656

In [13]:
test_data= pd.read_csv('test.csv')
Y_test = test_data[['Id']]
print(Y_test.head)
test_data['Attrition-0'] = Y_pred_prob_test[:,0]
test_data['Attrition-1'] = Y_pred_prob_test[:,1]
Y_test['Attrition']= test_data['Attrition-1']
print(Y_test.head)
Y_test.to_csv('Submission.csv',index=False) 

<bound method NDFrame.head of       Id
0      1
1      2
2      3
3      4
4      5
..   ...
465  466
466  467
467  468
468  469
469  470

[470 rows x 1 columns]>


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


<bound method NDFrame.head of       Id  Attrition
0      1   0.123438
1      2   0.095527
2      3   0.387144
3      4   0.325663
4      5   0.018985
..   ...        ...
465  466   0.609638
466  467   0.953071
467  468   0.256012
468  469   0.558065
469  470   0.059760

[470 rows x 2 columns]>
