# Import Library

In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from sklearn import preprocessing

In [2]:
#importing lib to ignore the warnings
import warnings
warnings.filterwarnings("ignore")

# Import  file

In [3]:
def read_file(path):
    file_path = path + 'SAI_Sesha_Alstom_R2' + '.txt'
    file = pd.read_csv(file_path)
    return file

#enter the file path. e.g. C:\\Users\\priyanka\\ or C:/Users/priyanka/
#default file name is SAI_Sesha_Alstom_R2.txt
path = input("Enter file path: ")
print("File path is " + str(path))

file = read_file(path)
file.head()

Enter file path: C:\\Users\\pgulhare\\OneDrive - Capgemini\\ds code\\Untitled Folder\\.ipynb_checkpoints\\
File path is C:\\Users\\pgulhare\\OneDrive - Capgemini\\ds code\\Untitled Folder\\.ipynb_checkpoints\\


Unnamed: 0,Name,Age,Gender,Salary_M,Dep_Family_Members,Pref_Fuel_Type,Location,Sold
0,Mohan,24,Male,70000,0,Diesel,Delhi,0.0
1,Saurabh,43,Male,55000,1,Petrol,Bangalore,1.0
2,Nilesh,36,Male,45000,2,Petrol,Chennai,0.0
3,Viart,30,Male,66000,0,Petrol,Delhi,1.0
4,Sachin,42,Male,71000,2,Diesel,Bangalore,1.0


# Preprocessing

In [4]:
#replacing the null value in Sold

file['derived_sold'] = np.where(( (file['Gender']== 'Female') & (file['Salary_M']>40000) ) , 1.0, 0)
file['Sold'] = file.Sold.fillna(file.derived_sold)

#drop the unrequired columns
file.drop(columns = ['Name','derived_sold'],  inplace=True)

#converting categorical into numerical
file['Gender'] = file['Gender'].apply(lambda x:1 if x=='Male' else 0)
file['Pref_Fuel_Type'] = file['Pref_Fuel_Type'].apply(lambda x:1 if x=='Petrol' else 0)

#encoding the multivariable feature
file = pd.get_dummies(file, prefix=['Loc', 'Mem'], columns=['Location', 'Dep_Family_Members'], drop_first=True)


In [5]:
#feature scaling for Age, Salary_M
x = file.iloc[:,[0,2]]
minmax_sc = preprocessing.MinMaxScaler(feature_range=(0,1))
x_scalar =  minmax_sc.fit_transform(x)

#Converting Numpy into Dataframe
x_scalar = pd.DataFrame(data = x_scalar, columns = ['Age','Salary_M'])

#merging two dataframes
file = pd.concat([file, x_scalar], axis=1)

In [6]:
file.head()

Unnamed: 0,Age,Gender,Salary_M,Pref_Fuel_Type,Sold,Loc_Chennai,Loc_Delhi,Mem_1,Mem_2,Mem_3,Mem_4,Mem_5,Age.1,Salary_M.1
0,24,1,70000,0,0.0,0,1,0,0,0,0,0,0.03125,0.626667
1,43,1,55000,1,1.0,0,0,1,0,0,0,0,0.625,0.426667
2,36,1,45000,1,0.0,1,0,0,1,0,0,0,0.40625,0.293333
3,30,1,66000,1,1.0,0,1,0,0,0,0,0,0.21875,0.573333
4,42,1,71000,0,1.0,0,0,0,1,0,0,0,0.59375,0.64


# Splitting X and y

In [7]:
# Splitting X and y
X = file.drop("Sold", axis=1)
y = file["Sold"]

#train_test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = .30, random_state =24)

In [8]:
print("y_train unique value count is :" +str(y_train.value_counts()))
print("y_train unique value count is :" +str(y_test.value_counts()))

y_train unique value count is :0.0    9
1.0    6
Name: Sold, dtype: int64
y_train unique value count is :0.0    4
1.0    3
Name: Sold, dtype: int64


# ML Algorithm

# Random Forest

In [9]:
acc = 0
ran_state =0

for i in range(100):
    rfc = RandomForestClassifier(random_state=i)
    rfc.fit(X_train,y_train)
    y_pred = rfc.predict(X_test)
    rfc_accuracy = accuracy_score(y_pred,y_test)
    if(rfc_accuracy>acc):
        acc = rfc_accuracy
        ran_state = i
print("Best random_state is :", str(ran_state))
rf = RandomForestClassifier(random_state=ran_state)
rf.fit(X_train,y_train)
y_pred = rf.predict(X_test)
rf_accuracy = round(accuracy_score(y_pred,y_test)*100,2)
print("Random Forest  is: "+str(rf_accuracy)+" %")


Best random_state is : 77
Random Forest  is: 100.0 %


# Decision Tree

In [10]:
acc = 0
ran_state =0
max_dep = 1

for i in range(1,9):
    for j in range(100):
        dtc = DecisionTreeClassifier(criterion="entropy", random_state = j,  max_depth = i)
        dtc.fit(X_train,y_train)
        y_pred = dtc.predict(X_test)
        dtc_accuracy = accuracy_score(y_pred,y_test)
        if(dtc_accuracy>acc):
            acc = dtc_accuracy
            max_dep = i
            ran_state = j
            #print(i,":",j)

print("Best max_depth is :", str(max_dep))
print("Best random_state is :", str(ran_state))
dt = DecisionTreeClassifier(criterion="entropy", random_state = ran_state,  max_depth = max_dep)
dt.fit(X_train,y_train)
y_pred = dt.predict(X_test)
dt_accuracy = round(accuracy_score(y_pred,y_test)*100,2)
print("Decision Tree  is: "+str(dt_accuracy)+" %")

Best max_depth is : 2
Best random_state is : 0
Decision Tree  is: 85.71 %


# Logistic Regression

In [11]:
lg = LogisticRegression()
lg.fit(X_train, y_train)
y_pred = lg.predict(X_test)
lg_accuracy = round(accuracy_score(y_pred,y_test)*100,2)
print("Logistic Regression accuracy is: "+str(lg_accuracy)+" %")

Logistic Regression accuracy is: 57.14 %


# Support Vector Machine

In [12]:
svm = SVC(gamma='auto')
svm.fit(X_train, y_train)
y_pred = svm.predict(X_test)
svm_accuracy = round(accuracy_score(y_pred,y_test)*100,2)
print("Support Vector Machine accuracy is: "+str(svm_accuracy)+" %")

Support Vector Machine accuracy is: 57.14 %


# Model Accuracy consolidated

In [13]:
df_result = pd.DataFrame({
    'Model':    ['RandomForest', 'LogisticRegression', 'Support Vector Machine', 'Decision Tree'],
    'Accuracy': [rf_accuracy, lg_accuracy, svm_accuracy, dt_accuracy]})
finalresult = df_result.sort_values(by='Accuracy', ascending=False)
finalresult = finalresult.set_index('Accuracy')
finalresult

Unnamed: 0_level_0,Model
Accuracy,Unnamed: 1_level_1
100.0,RandomForest
85.71,Decision Tree
57.14,LogisticRegression
57.14,Support Vector Machine
