In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.compose import make_column_transformer
from sklearn.preprocessing import KBinsDiscretizer

%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns

## Load Data (Change to your path)

In [None]:
import os

# Folder that holds the CSV files. Set the ADVANCED_ANALYTICS_DATA_DIR
# environment variable to point at your own copy of the data; the fallback is
# the original author's local path.
DATA_DIR = os.environ.get(
    'ADVANCED_ANALYTICS_DATA_DIR',
    '/home/omar/Documents/MAI/Semester2/Advanced-analytics/data',
)
data = pd.read_csv(os.path.join(DATA_DIR, 'train_month_3_with_target.csv'))

#### Simple Data Description

In [None]:
# Quick overview of the raw frame: size, class balance and column dtypes.
n_records, n_attributes = data.shape[0], data.columns.size
print(f"{n_records} records")
print(f"{n_attributes} attributes", '\n')

print(data["target"].value_counts(), '\n')

data_types = data.dtypes
print(data_types)

# Last expression of the cell: preview of the first rows.
data.head()

## Split Data

In [None]:
# Label vector (run once).
target = data["target"]

# Feature frame: every column except the row identifier (client_id) and the
# label itself (run once).
data_modified = data.drop(columns=["client_id", "target"])

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(data_modified, target, test_size=0.3, random_state=0)

# The following cells add and overwrite columns on both feature frames.
# Work on explicit copies so those writes never target a slice of
# data_modified (this avoids pandas' SettingWithCopyWarning).
X_train = X_train.copy()
X_test = X_test.copy()

print(f"Size of train data: {len(X_train)}")
print(f"Size of test data: {len(X_test)}")

print()
print("labels distribution in train data: ")
print(y_train.value_counts())
print()
print("labels distribution in test data: ")
print(y_test.value_counts())

In [None]:
# Feature Engineering

### (Training data) Convert customer_since_all, customer_since_bank and customer_birth_date into elapsed years

In [None]:
# Turn the three date columns into "number of calendar years until now".
# pd.datetime was removed in pandas 2.0, so the current year comes from
# pd.Timestamp and the year difference is computed with the vectorised .dt
# accessor instead of a row-wise apply. Missing dates stay missing (NaN).
# NOTE: the result depends on the year in which the notebook is executed.
current_year = pd.Timestamp.now().year

X_train["customer_birth_date"] = pd.to_datetime(X_train["customer_birth_date"])
X_train["age"] = current_year - X_train["customer_birth_date"].dt.year

X_train["customer_since_all"] = pd.to_datetime(X_train["customer_since_all"])
X_train["customer_since_all_years"] = current_year - X_train["customer_since_all"].dt.year

X_train["customer_since_bank"] = pd.to_datetime(X_train["customer_since_bank"])
X_train["customer_since_bank_years"] = current_year - X_train["customer_since_bank"].dt.year

In [None]:
# Remove the three raw date columns together with customer_postal_code.
X_train.drop(
    columns=[
        'customer_since_all',
        'customer_since_bank',
        'customer_birth_date',
        'customer_postal_code',
    ],
    inplace=True,
)

### Checking null values

In [None]:
# Missing values per column; only columns with at least one gap are shown.
null_counts = X_train.isna().sum()
null_counts.loc[null_counts.gt(0)]

In [None]:
# Remove the customer_education column from the training features.
X_train.drop(columns='customer_education', inplace=True)

In [None]:
## Impute missing values
from sklearn.impute import SimpleImputer

# Two imputers: one fills gaps with the column mean, the other with the most
# frequent value of the column.
imp_mean = SimpleImputer(missing_values=np.nan, strategy='mean')
imp_frquent = SimpleImputer(missing_values=np.nan, strategy='most_frequent')

# Both imputers are fitted here on the training split; later cells reuse them
# with .transform() on X_test and test_data.
# Most-frequent value for occupation code, children and relationship ...
X_train.loc(axis=1)["customer_occupation_code", "customer_children", "customer_relationship"] = imp_frquent.fit_transform(X_train.loc(axis=1)["customer_occupation_code", "customer_children", "customer_relationship"])
# ... and the column mean for the two "years as customer" features.
X_train.loc(axis=1)[["customer_since_all_years", "customer_since_bank_years"]] = imp_mean.fit_transform(X_train.loc(axis=1)[["customer_since_all_years", "customer_since_bank_years"]])

## Encoding categorical features

In [None]:
# Categorical features: customer_occupation_code, customer_children,
# customer_relationship, customer_gender, customer_self_employed,
# visits_distinct_so, visits_distinct_so_areas.
# Show the distinct values of customer_relationship.
pd.unique(X_train["customer_relationship"])

In [None]:
# One-hot encode the multi-valued categorical columns of the training frame.
# Each indicator column is named "<prefix>_<category value>"; the indicator
# frame is built once per source column and reused for naming and assignment.
occupation_dummies = pd.get_dummies(X_train['customer_occupation_code'])
occupation_codes = [f"is_occupation_code_{level}" for level in occupation_dummies.columns]
X_train[occupation_codes] = occupation_dummies

children_dummies = pd.get_dummies(X_train['customer_children'])
customer_children_codes = [f"is_customer_children_{level}" for level in children_dummies.columns]
X_train[customer_children_codes] = children_dummies

visits_so_dummies = pd.get_dummies(X_train['visits_distinct_so'])
visitis_distinct_codes = [f"is_visits_distinct_so_{level}" for level in visits_so_dummies.columns]
X_train[visitis_distinct_codes] = visits_so_dummies

visits_areas_dummies = pd.get_dummies(X_train['visits_distinct_so_areas'])
visitis_distinct_areas_codes = [f"is_visits_distinct_areas_{level}" for level in visits_areas_dummies.columns]
X_train[visitis_distinct_areas_codes] = visits_areas_dummies

In [None]:
# Recode the two-valued features to 0/1: gender 1 -> 0 and 2 -> 1,
# relationship 'single' -> 0 and 'couple' -> 1. Other values are left as they are.
gender_recoding = {1: 0, 2: 1}
relationship_recoding = {'single': 0, 'couple': 1}

X_train['customer_gender'] = X_train['customer_gender'].replace(gender_recoding)
X_train['customer_relationship'] = X_train['customer_relationship'].replace(relationship_recoding)

#### delete encoded columns

In [None]:
# The source columns of the one-hot encoding are no longer needed.
X_train.drop(
    columns=[
        'customer_occupation_code',
        'customer_children',
        'visits_distinct_so',
        'visits_distinct_so_areas',
    ],
    inplace=True,
)

In [None]:
## try to add some features
# # bal_insurance = bal_insurance_21 + bal_insurance_23
# X_train["bal_insurance"] = X_train["bal_insurance_21"] + X_train["bal_insurance_23"]

# X_train["bal_loan"] = X_train["bal_mortgage_loan"] + X_train["bal_personal_loan"]

# X_train["bal_starter"] = X_train["bal_savings_account_starter"] + X_train["bal_current_account_starter"]

# X_train["bal_money"] = X_train["bal_pension_saving"] + X_train["bal_savings_account"] + X_train["bal_current_account"]

# X_train["cap"] = X_train["cap_life_insurance_decreasing_cap"] + X_train["cap_life_insurance_fixed_cap"]

# X_train.drop(['bal_insurance_21', 'bal_insurance_23', 'bal_mortgage_loan', 'bal_personal_loan', 'bal_savings_account_starter'
#              ,'bal_current_account_starter','cap_life_insurance_decreasing_cap','cap_life_insurance_fixed_cap','bal_pension_saving', 'bal_savings_account', 'bal_current_account'], axis=1, inplace=True)

## Discretizing continuous features (binning)

In [None]:
# Balance / capital columns that get replaced by bin indices.
columns = [
    'bal_insurance_21',
    'bal_insurance_23',
    'bal_mortgage_loan',
    'bal_personal_loan',
    'bal_savings_account_starter',
    'bal_current_account_starter',
    'bal_pension_saving',
    'bal_savings_account',
    'bal_current_account',
    'cap_life_insurance_decreasing_cap',
    'cap_life_insurance_fixed_cap',
]

# Uniform-width discretisation with ordinal bin codes.
# Other bin counts that were tried: n_bins=10 and n_bins=50.
kb1 = KBinsDiscretizer(n_bins=100, encode='ordinal', strategy='uniform')

# Learn the bin edges on the training split and apply them in one step.
X_train[columns] = kb1.fit_transform(X_train[columns])

## Applying same transformation to test data

In [None]:
# Same date handling as for the training split: convert the three date columns
# to "number of calendar years until now".
# pd.datetime was removed in pandas 2.0, so the current year comes from
# pd.Timestamp and the year difference is computed with the vectorised .dt
# accessor instead of a row-wise apply. Missing dates stay missing (NaN).
current_year = pd.Timestamp.now().year

X_test["customer_birth_date"] = pd.to_datetime(X_test["customer_birth_date"])
X_test["age"] = current_year - X_test["customer_birth_date"].dt.year

X_test["customer_since_all"] = pd.to_datetime(X_test["customer_since_all"])
X_test["customer_since_all_years"] = current_year - X_test["customer_since_all"].dt.year

X_test["customer_since_bank"] = pd.to_datetime(X_test["customer_since_bank"])
X_test["customer_since_bank_years"] = current_year - X_test["customer_since_bank"].dt.year

In [None]:
# Remove the three raw date columns together with customer_postal_code.
X_test.drop(
    columns=[
        'customer_since_all',
        'customer_since_bank',
        'customer_birth_date',
        'customer_postal_code',
    ],
    inplace=True,
)

In [None]:
# Remove the customer_education column from the evaluation features.
X_test.drop(columns='customer_education', inplace=True)

In [None]:
# Fill gaps in the evaluation split with the imputers fitted on X_train
# (transform only, no refit): most frequent value for the three categorical
# columns, training mean for the two "years as customer" columns.
X_test.loc(axis=1)["customer_occupation_code", "customer_children", "customer_relationship"] = imp_frquent.transform(X_test.loc(axis=1)["customer_occupation_code", "customer_children", "customer_relationship"])
X_test.loc(axis=1)[["customer_since_all_years", "customer_since_bank_years"]] = imp_mean.transform(X_test.loc(axis=1)[["customer_since_all_years", "customer_since_bank_years"]])

In [None]:
# Recode the two-valued features to 0/1: gender 1 -> 0 and 2 -> 1,
# relationship 'single' -> 0 and 'couple' -> 1. Other values are left as they are.
gender_recoding = {1: 0, 2: 1}
relationship_recoding = {'single': 0, 'couple': 1}

X_test['customer_gender'] = X_test['customer_gender'].replace(gender_recoding)
X_test['customer_relationship'] = X_test['customer_relationship'].replace(relationship_recoding)

In [None]:
# One-hot encode the multi-valued categorical columns of the evaluation frame.
# Each indicator column is named "<prefix>_<category value>"; the indicator
# frame is built once per source column and reused for naming and assignment.
occupation_dummies = pd.get_dummies(X_test['customer_occupation_code'])
occupation_codes = [f"is_occupation_code_{level}" for level in occupation_dummies.columns]
X_test[occupation_codes] = occupation_dummies

children_dummies = pd.get_dummies(X_test['customer_children'])
customer_children_codes = [f"is_customer_children_{level}" for level in children_dummies.columns]
X_test[customer_children_codes] = children_dummies

visits_so_dummies = pd.get_dummies(X_test['visits_distinct_so'])
visitis_distinct_codes = [f"is_visits_distinct_so_{level}" for level in visits_so_dummies.columns]
X_test[visitis_distinct_codes] = visits_so_dummies

visits_areas_dummies = pd.get_dummies(X_test['visits_distinct_so_areas'])
visitis_distinct_areas_codes = [f"is_visits_distinct_areas_{level}" for level in visits_areas_dummies.columns]
X_test[visitis_distinct_areas_codes] = visits_areas_dummies

In [None]:
# The source columns of the one-hot encoding are no longer needed.
X_test.drop(
    columns=[
        'customer_occupation_code',
        'customer_children',
        'visits_distinct_so',
        'visits_distinct_so_areas',
    ],
    inplace=True,
)

In [None]:
# bal_insurance = bal_insurance_21 + bal_insurance_23
# X_test["bal_insurance"] = X_test["bal_insurance_21"] + X_test["bal_insurance_23"]

# X_test["bal_loan"] = X_test["bal_mortgage_loan"] + X_test["bal_personal_loan"]

# X_test["bal_starter"] = X_test["bal_savings_account_starter"] + X_test["bal_current_account_starter"]

# X_test["bal_money"] = X_test["bal_pension_saving"] + X_test["bal_savings_account"] + X_test["bal_current_account"]

# X_test["cap"] = X_test["cap_life_insurance_decreasing_cap"] + X_test["cap_life_insurance_fixed_cap"]

In [None]:
# X_test.drop(['bal_insurance_21', 'bal_insurance_23', 'bal_mortgage_loan', 'bal_personal_loan', 'bal_savings_account_starter'
#              ,'bal_current_account_starter','cap_life_insurance_decreasing_cap','cap_life_insurance_fixed_cap', 'bal_pension_saving', 'bal_savings_account', 'bal_current_account'], axis=1, inplace=True)

In [None]:
# Discretise the evaluation split with the KBinsDiscretizer (kb1) that was
# fitted on X_train: transform only, the bin edges are not re-learned here.

X_test[columns] = kb1.transform(X_test[columns])

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

In [None]:
# Settings shared by both logistic-regression models.
base_lr_params = dict(random_state=0, solver='liblinear', penalty='l2', max_iter=1000)

# clf1: no class weighting.
clf1 = LogisticRegression(**base_lr_params).fit(X_train, y_train)

# clf2: C=1 and class_weight='balanced'.
clf2 = LogisticRegression(C=1, class_weight='balanced', **base_lr_params).fit(X_train, y_train)

#clf3 = SVC(class_weight='balanced', C = 100, probability=True, kernel='rbf',verbose=True).fit(X_train, y_train)

In [None]:
# Align the evaluation frame with the training columns (same set, same order).
# Columns that exist only in X_train are added and filled with 0; columns that
# exist only in X_test are discarded.
X_test = X_test.reindex(columns = X_train.columns, fill_value=0)

In [None]:
# Hard class predictions on the hold-out split, one array per classifier.
y_pred1, y_pred2 = (model.predict(X_test) for model in (clf1, clf2))
#y_pred3 = clf3.predict(X_test)

In [None]:
from sklearn import metrics

# Hold-out metrics for both classifiers. Accuracy, precision, recall and F1
# use the hard predictions; AUC uses the predicted probability of class 1.
print("For classifier one: ")

print("Accuracy = ", metrics.accuracy_score(y_test, y_pred1))
print("Precision = ", metrics.precision_score(y_test, y_pred1, average='binary'))
print("Recall = ",metrics.recall_score(y_test, y_pred1))
print("F1 = ", metrics.f1_score(y_test, y_pred1))
print("auc = ", metrics.roc_auc_score(y_test, clf1.predict_proba(X_test)[:, 1]))

print()
print("For classifier two: ")

print("Accuracy = ", metrics.accuracy_score(y_test, y_pred2))
# No `labels` argument: it must be array-like (a bare 1 is rejected by the
# parameter validation of recent scikit-learn releases) and it has no effect
# with average='binary', where the scored class is chosen by pos_label.
print("Precision = ", metrics.precision_score(y_test, y_pred2, average='binary'))
print("Recall = ",metrics.recall_score(y_test, y_pred2))
print("F1 = ", metrics.f1_score(y_test, y_pred2))
print("Auc = ", metrics.roc_auc_score(y_test, clf2.predict_proba(X_test)[:, 1]))

print(metrics.classification_report(y_test, y_pred2))

# print("Accuracy = ", metrics.accuracy_score(y_test, y_pred3))
# print("Precision = ", metrics.precision_score(y_test, y_pred3, average='binary'))
# print("Recall = ",metrics.recall_score(y_test, y_pred3))
# print("F1 = ", metrics.f1_score(y_test, y_pred3, average='micro'))
# print("Auc = ", metrics.roc_auc_score(y_test, clf3.predict_proba(X_test)[:, 1]))

# print(metrics.classification_report(y_test, y_pred3))

In [None]:
# Confusion matrix of clf1 on the hold-out split (rows: true class, columns: predicted class).
plt.figure(figsize=(10,8))
cm = metrics.confusion_matrix(y_test, y_pred1)
# fmt="d" prints the integer counts in full; seaborn's default format ('.2g')
# would abbreviate large counts to scientific notation.
ax = sns.heatmap(cm, annot=True, fmt="d", annot_kws={"size": 16}) # font size
ax.set(xlabel="Predicted label", ylabel="True label")
plt.title("Logistic Regression without class weighting")
print(cm)

In [None]:
# Confusion matrix of clf2 on the hold-out split (rows: true class, columns: predicted class).
plt.figure(figsize=(10,8))
cm = metrics.confusion_matrix(y_test, y_pred2)
# fmt="d" prints the integer counts in full; seaborn's default format ('.2g')
# would abbreviate large counts to scientific notation.
ax = sns.heatmap(cm, annot=True, fmt="d", annot_kws={"size": 16}) # font size
ax.set(xlabel="Predicted label", ylabel="True label")
plt.title("Logistic Regression with class weighting")
print(cm)

### Model selection

In [None]:
# from sklearn.model_selection import GridSearchCV
# parameters = [{'solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga']},
#               {'penalty':['none', 'elasticnet', 'l1', 'l2']},
#               {'C':[0.001, 0.01, 0.1, 1, 10, 100]},
#              {'class_weight':['balanced', {"0":0.1,"1":0.9},{"0":0.05,"1":0.995}]},
#               {"max_iter":[100,1000]}
#              ]

# grid_search = GridSearchCV(estimator = clf2,  
#                            param_grid = parameters,
#                            cv = 5,
#                            verbose=1)


# grid_search.fit(X_train, y_train)

# Test Data Results

In [None]:
import os

# Score the separate test file with the class-weighted model (clf2).
# The steps mirror the preprocessing applied to X_train and reuse the imputers
# and the discretizer that were fitted on the training split.

# Folder that holds the CSV files. Set the ADVANCED_ANALYTICS_DATA_DIR
# environment variable to point at your own copy of the data; the fallback is
# the original author's local path.
DATA_DIR = os.environ.get(
    'ADVANCED_ANALYTICS_DATA_DIR',
    '/home/omar/Documents/MAI/Semester2/Advanced-analytics/data',
)
test_data = pd.read_csv(os.path.join(DATA_DIR, 'test_month_3.csv'))
# Keep the ids aside: the reindex further down restricts test_data to the
# training feature columns, which do not include client_id.
client_ids = test_data["client_id"]

# Date columns -> number of calendar years until now.
# pd.datetime was removed in pandas 2.0, so the current year comes from
# pd.Timestamp and the year difference is computed with the vectorised .dt
# accessor instead of a row-wise apply. Missing dates stay missing (NaN).
current_year = pd.Timestamp.now().year

test_data["customer_birth_date"] = pd.to_datetime(test_data["customer_birth_date"])
test_data["age"] = current_year - test_data["customer_birth_date"].dt.year

test_data["customer_since_all"] = pd.to_datetime(test_data["customer_since_all"])
test_data["customer_since_all_years"] = current_year - test_data["customer_since_all"].dt.year

test_data["customer_since_bank"] = pd.to_datetime(test_data["customer_since_bank"])
test_data["customer_since_bank_years"] = current_year - test_data["customer_since_bank"].dt.year

# Drop the raw date columns together with customer_postal_code.
test_data.drop(['customer_since_all', 'customer_since_bank', 'customer_birth_date', 'customer_postal_code'], axis=1, inplace=True)

# Drop customer_education.
test_data.drop(['customer_education'], axis=1, inplace=True)

# Impute with the imputers fitted on X_train (transform only, no refit).
test_data.loc(axis=1)["customer_occupation_code", "customer_children", "customer_relationship"] = imp_frquent.transform(test_data.loc(axis=1)["customer_occupation_code", "customer_children", "customer_relationship"])
test_data.loc(axis=1)[["customer_since_all_years", "customer_since_bank_years"]] = imp_mean.transform(test_data.loc(axis=1)[["customer_since_all_years", "customer_since_bank_years"]])

# Recode the two-valued features to 0/1.
test_data['customer_gender'] = test_data['customer_gender'].replace([1,2],[0,1])
test_data['customer_relationship'] = test_data['customer_relationship'].replace(['single','couple'],[0,1])

# One-hot encode the multi-valued categorical columns; each indicator frame is
# built once and reused for naming and assignment.
occupation_dummies = pd.get_dummies(test_data['customer_occupation_code'])
occupation_codes = [f"is_occupation_code_{i}" for i in occupation_dummies.columns]
test_data[occupation_codes] = occupation_dummies

children_dummies = pd.get_dummies(test_data['customer_children'])
customer_children_codes = [f"is_customer_children_{i}" for i in children_dummies.columns]
test_data[customer_children_codes] = children_dummies

visits_so_dummies = pd.get_dummies(test_data['visits_distinct_so'])
visitis_distinct_codes = [f"is_visits_distinct_so_{i}" for i in visits_so_dummies.columns]
test_data[visitis_distinct_codes] = visits_so_dummies

visits_areas_dummies = pd.get_dummies(test_data['visits_distinct_so_areas'])
visitis_distinct_areas_codes = [f"is_visits_distinct_areas_{i}" for i in visits_areas_dummies.columns]
test_data[visitis_distinct_areas_codes] = visits_areas_dummies

test_data.drop(['customer_occupation_code', 'customer_children', 'visits_distinct_so', 'visits_distinct_so_areas'], axis=1, inplace=True)

# Discretise with the bin edges learned on the training split.
test_data[columns] = kb1.transform(test_data[columns])

# Align with the training columns: indicator columns missing here are added
# and filled with 0, columns unknown to the model are discarded.
test_data = test_data.reindex(columns = X_train.columns, fill_value=0)

#clf2.fit(pd.concat([X_train,X_test]),pd.concat([y_train,y_test]))

y_final_test = clf2.predict(test_data)

# Distribution of the predicted classes.
from collections import Counter
print(Counter(y_final_test))

# Output file: client id and predicted probability of class 1, written without a header row.
scores = pd.DataFrame({'id':client_ids, "scores":clf2.predict_proba(test_data)[:,1]})

scores.to_csv('scores_try2.csv',header=False, index=False)

In [None]:
# Counter({0: 18624, 1: 8676})