In [1]:
# Import the required modules
import pandas as pd
pd.set_option('display.max_columns', None)
import numpy as np

# visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Machine Learning
from sklearn.model_selection import train_test_split

# Algorithms
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn import tree
from sklearn.model_selection import KFold 
from sklearn.model_selection import train_test_split

# Metrics
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.metrics import roc_curve, roc_auc_score
from sklearn.metrics import accuracy_score, classification_report


# Preprocessing
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.preprocessing import LabelEncoder
from sklearn.pipeline import Pipeline

# suppress warnings
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Load the raw dataset from the Resources folder and preview the first rows
df = pd.read_csv("Resources/original_data.csv")
df.head()

Unnamed: 0,ID,Gender,Own_car,Own_property,Work_phone,Phone,Email,Unemployed,Num_children,Num_family,Account_length,Total_income,Age,Years_employed,Income_type,Education_type,Family_status,Housing_type,Occupation_type,Target
0,5008804,1,1,1,1,0,0,0,0,2,15,427500.0,32.868574,12.435574,Working,Higher education,Civil marriage,Rented apartment,Other,1
1,5008806,1,1,1,0,0,0,0,0,2,29,112500.0,58.793815,3.104787,Working,Secondary / secondary special,Married,House / apartment,Security staff,0
2,5008808,0,0,1,0,1,1,0,0,1,4,270000.0,52.321403,8.353354,Commercial associate,Secondary / secondary special,Single / not married,House / apartment,Sales staff,0
3,5008812,0,0,1,0,0,0,1,0,1,20,283500.0,61.504343,0.0,Pensioner,Higher education,Separated,House / apartment,Other,0
4,5008815,1,1,1,1,1,1,0,0,2,5,270000.0,46.193967,2.10545,Working,Higher education,Married,House / apartment,Accountants,0


In [3]:
# Drop the contact-detail flags and the row identifier.
# errors="ignore" keeps this cell idempotent: `df` is reassigned in place, so
# re-running the cell after the columns are gone would otherwise raise a KeyError.
columns_to_drop = ['Work_phone', 'Phone', 'Email', 'ID']
df = df.drop(columns=columns_to_drop, errors="ignore")

In [4]:
# Re-check the first five rows after the column drop
df.head(5)

Unnamed: 0,Gender,Own_car,Own_property,Unemployed,Num_children,Num_family,Account_length,Total_income,Age,Years_employed,Income_type,Education_type,Family_status,Housing_type,Occupation_type,Target
0,1,1,1,0,0,2,15,427500.0,32.868574,12.435574,Working,Higher education,Civil marriage,Rented apartment,Other,1
1,1,1,1,0,0,2,29,112500.0,58.793815,3.104787,Working,Secondary / secondary special,Married,House / apartment,Security staff,0
2,0,0,1,0,0,1,4,270000.0,52.321403,8.353354,Commercial associate,Secondary / secondary special,Single / not married,House / apartment,Sales staff,0
3,0,0,1,1,0,1,20,283500.0,61.504343,0.0,Pensioner,Higher education,Separated,House / apartment,Other,0
4,1,1,1,0,0,2,5,270000.0,46.193967,2.10545,Working,Higher education,Married,House / apartment,Accountants,0


In [6]:
# Column dtypes and non-null counts (a quick check for missing values)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9709 entries, 0 to 9708
Data columns (total 16 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Gender           9709 non-null   int64  
 1   Own_car          9709 non-null   int64  
 2   Own_property     9709 non-null   int64  
 3   Unemployed       9709 non-null   int64  
 4   Num_children     9709 non-null   int64  
 5   Num_family       9709 non-null   int64  
 6   Account_length   9709 non-null   int64  
 7   Total_income     9709 non-null   float64
 8   Age              9709 non-null   float64
 9   Years_employed   9709 non-null   float64
 10  Income_type      9709 non-null   object 
 11  Education_type   9709 non-null   object 
 12  Family_status    9709 non-null   object 
 13  Housing_type     9709 non-null   object 
 14  Occupation_type  9709 non-null   object 
 15  Target           9709 non-null   int64  
dtypes: float64(3), int64(8), object(5)
memory usage: 1.2+ MB


In [11]:
# dtype of every column; the object columns are the categorical text fields
df.dtypes

Gender               int64
Own_car              int64
Own_property         int64
Unemployed           int64
Num_children         int64
Num_family           int64
Account_length       int64
Total_income       float64
Age                float64
Years_employed     float64
Income_type         object
Education_type      object
Family_status       object
Housing_type        object
Occupation_type     object
Target               int64
dtype: object

In [None]:
# Collect the names of all object-dtype (text) columns
str_cols = [name for name, dtype in df.dtypes.items() if dtype == object]
str_cols

In [None]:
# Number of distinct categories in each text column
df[str_cols].nunique()

In [None]:
# Row count per occupation, most frequent first
df.Occupation_type.value_counts()

In [None]:
# Work on a copy so `df` keeps the un-binned, un-encoded data
df2 = df.copy()

In [None]:
# Bin rare occupations: every occupation with fewer than OCCUPATION_MIN_COUNT rows
# is folded into the "Other" category.
OCCUPATION_MIN_COUNT = 100

# Compute the counts once and reuse them for the cutoff comparison
occupation_counts = df2['Occupation_type'].value_counts()
occupations_to_replace = list(occupation_counts[occupation_counts < OCCUPATION_MIN_COUNT].index)

# Replace all rare occupations in one vectorized pass instead of one pass per class
is_rare = df2['Occupation_type'].isin(occupations_to_replace)
df2['Occupation_type'] = df2['Occupation_type'].mask(is_rare, "Other")

# Check to make sure binning was successful
df2['Occupation_type'].value_counts()

In [None]:
# One-hot encode the categorical text columns into integer 0/1 indicator columns
df2 = pd.get_dummies(df2, dtype=int)
df2.head()

In [None]:
# # Preprocessing: Convert categorical variables into numerical values
# le = LabelEncoder()
# df['Income_type'] = le.fit_transform(df['Income_type'])
# df['Education_type'] = le.fit_transform(df['Education_type'])
# df['Family_status'] = le.fit_transform(df['Family_status'])
# df['Housing_type'] = le.fit_transform(df['Housing_type'])
# df['Occupation_type'] = le.fit_transform(df['Occupation_type'])
# df.head()

In [None]:
# Inspect the columns and dtypes of the encoded frame
df2.info()

In [None]:
# Summary statistics of the un-encoded frame
df.describe()

In [None]:
# Pairwise correlations between all columns of the encoded frame (Target included)
df2.corr()

In [None]:
# Row count per target class
df2.Target.value_counts()

In [None]:
# #SMOT
# X, y = make_classification(random_state=42)
# X_train, X_test, y_train, y_test = train_test_split(X, y,random_state=42)
# >>> pipe = Pipeline([('scaler', StandardScaler()), ('svc', SVC())])
# >>> # The pipeline can be used as any other estimator
# >>> # and avoids leaking the test set into the train set
# >>> pipe.fit(X_train, y_train).score(X_test, y_test)
# 0.88
# >>> # An estimator's parameter can be set using '__' syntax
# >>> pipe.set_params(svc__C=10).fit(X_train, y_train).score(X_test, y_test)
# 0.76

In [None]:
# Display the un-encoded frame (`df2` holds the binned and one-hot encoded copy)
df

In [None]:
# The target classes are imbalanced, so the minority class may be hard to predict.

In [None]:
# Separate the label from the predictors
y = df2["Target"]
X = df2.drop(columns="Target")  # every remaining column is used as a feature

# Stratified 75/25 hold-out split with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    stratify=y,
    random_state=42,
)

# Report the shape of each partition
print(X_train.shape, X_test.shape, sep="\n")

In [None]:

# Stratified 70/30 split of the whole encoded frame (features and Target together).
# The earlier version referenced `data` and `data.buy`, which are never defined in
# this notebook; the encoded frame is `df2` and its label column is `Target`.
# A fixed seed keeps the split reproducible.
train, test = train_test_split(df2, test_size=0.3, stratify=df2.Target, random_state=42)

In [None]:
def _report_split(model, X, y, label, heading):
    """Print the confusion matrix / classification report for one split and plot its ROC curve.

    Parameters
    ----------
    model : fitted classifier exposing ``predict`` and ``predict_proba``.
    X, y : features and true labels of the split being evaluated.
    label : word used inside the printed report (e.g. "Train").
    heading : upper-case word used in the banner and the plot title (e.g. "TRAINING").
    """
    preds = model.predict(X)
    # Column 1 of predict_proba is the score of the class at index 1 of model.classes_
    scores = model.predict_proba(X)[:, 1]

    cr = classification_report(y, preds)
    cm = confusion_matrix(y, preds)

    report = f"""
    {label} Confusion Matrix: 
    {cm}

    {label} Report: 
    {cr}
    """
    print(f"{heading} METRICS")
    print(report)
    print()

    # Compute fpr, tpr, thresholds and roc auc
    fpr, tpr, thresholds = roc_curve(y, scores)
    roc_auc = roc_auc_score(y, scores)

    # Plot ROC curve
    plt.plot(fpr, tpr, label='ROC curve (area = %0.3f)' % roc_auc)
    plt.plot([0, 1], [0, 1], 'k--')  # random predictions curve
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.0])
    plt.xlabel('False Positive Rate or (1 - Specifity)')
    plt.ylabel('True Positive Rate or (Sensitivity)')
    plt.title(f'{heading} Receiver Operating Characteristic')
    plt.legend(loc="lower right")
    plt.show()


def doClassification(model, X_train, X_test, y_train, y_test):
    """Evaluate an already-fitted classifier on the training and the test split.

    For each split this prints the confusion matrix and classification report and
    shows a ROC curve with its AUC. The model is not fitted here; it must already
    be fitted and must support ``predict_proba``.

    This function was previously commented out although the model cells below
    call it, so those cells failed with a NameError.
    """
    _report_split(model, X_train, y_train, "Train", "TRAINING")
    print()
    print()
    _report_split(model, X_test, y_test, "Test", "TESTING")

In [None]:
 pipe = Pipeline([('scaler', StandardScaler()), ('model', SVC())])
pipe.fit(X_train, y_train).score(X_test, y_test) 0.88

In [None]:
# initialize
# Standardize inside a pipeline: the raw features mix 0/1 flags with incomes in the
# hundreds of thousands, which hampers the solver's convergence (and convergence
# warnings are silenced in this notebook). The scaler is fit on the training split only.
lr = Pipeline([('scaler', StandardScaler()), ('model', LogisticRegression())])

# fit
lr.fit(X_train, y_train)

doClassification(lr, X_train, X_test, y_train, y_test)

In [None]:
# initialize
# SVC is sensitive to feature scale, so standardize inside a pipeline (scaler fit on
# the training split only). probability=True enables predict_proba for the ROC curve;
# random_state makes those probability estimates reproducible.
sv = Pipeline([('scaler', StandardScaler()), ('model', SVC(probability=True, random_state=42))])

# fit
sv.fit(X_train, y_train)

doClassification(sv, X_train, X_test, y_train, y_test)

In [None]:
# initialize
# KNN compares raw distances, so without scaling the large-valued income column
# would dominate every neighbour search. Standardize inside a pipeline
# (scaler fit on the training split only).
knn = Pipeline([('scaler', StandardScaler()), ('model', KNeighborsClassifier(n_neighbors=7))])

# fit
knn.fit(X_train, y_train)

doClassification(knn, X_train, X_test, y_train, y_test)

In [None]:
# initialize
dt = DecisionTreeClassifier(random_state=42)

# fit
# X_train_scaled / X_test_scaled are never defined in this notebook. Tree splits
# depend only on the ordering of feature values, so the unscaled split is used.
dt.fit(X_train, y_train)

doClassification(dt, X_train, X_test, y_train, y_test)

In [None]:
# Decision tree has the lowest number of false negatives (max recall), so this is the best model.

In [None]:
# initialize
rf = RandomForestClassifier(random_state=42)

# fit
# X_train_scaled / X_test_scaled are never defined in this notebook. Tree ensembles
# do not need standardized inputs, so the unscaled split is used.
rf.fit(X_train, y_train)

doClassification(rf, X_train, X_test, y_train, y_test)

In [None]:
# initialize
ada = AdaBoostClassifier(random_state=42)

# fit
# X_train_scaled / X_test_scaled are never defined in this notebook. The default
# tree-based weak learners do not need standardized inputs, so the unscaled split is used.
ada.fit(X_train, y_train)

doClassification(ada, X_train, X_test, y_train, y_test)

In [None]:
# initialize
gb = GradientBoostingClassifier(random_state=42)

# fit
# X_train_scaled / X_test_scaled are never defined in this notebook. Boosted trees
# do not need standardized inputs, so the unscaled split is used.
gb.fit(X_train, y_train)

doClassification(gb, X_train, X_test, y_train, y_test)

In [None]:
# Decision Tree

In [None]:
# Text dump of the fitted tree's decision rules. Passing the column names makes the
# rules readable instead of showing generic feature_0, feature_1, ... labels.
text_representation = tree.export_text(dt, feature_names=list(X.columns))
print(text_representation)

In [None]:
dt_feature_names = list(X.columns)
# plot_tree expects class names in ascending order of the class labels.
# y.unique() returns them in order of first appearance, which would attach the
# wrong name to each class whenever the first row is not the smallest label.
dt_target_names = [str(label) for label in sorted(y.unique())]

In [None]:
# Draw the fitted decision tree on a wide canvas with filled (coloured) nodes
fig, ax = plt.subplots(figsize=(20, 10))
plot_tree(
    dt,
    feature_names=dt_feature_names,
    class_names=dt_target_names,
    filled=True,
    ax=ax,
)
plt.show()

In [None]:
# K-Fold Validation
# Manual k-fold cross validation of a logistic regression on the full feature set.

k = 5
kf = KFold(n_splits=k, random_state=None)
model = LogisticRegression(solver='liblinear')

acc_score = []

# Fold-local names keep the hold-out split (X_train, X_test, y_train, y_test)
# from being overwritten by the last fold.
for train_index, test_index in kf.split(X):
    X_fold_train, X_fold_test = X.iloc[train_index, :], X.iloc[test_index, :]
    # kf.split yields positional indices, so y must be indexed by position too;
    # plain y[...] is label-based and only works while the index is 0..n-1.
    y_fold_train, y_fold_test = y.iloc[train_index], y.iloc[test_index]

    model.fit(X_fold_train, y_fold_train)
    pred_values = model.predict(X_fold_test)

    # accuracy_score takes (y_true, y_pred)
    acc = accuracy_score(y_fold_test, pred_values)
    acc_score.append(acc)

# Average over the folds actually evaluated
avg_acc_score = sum(acc_score) / len(acc_score)

print('accuracy of each fold - {}'.format(acc_score))
print('Avg accuracy : {}'.format(avg_acc_score))