In [None]:
import os
os.getcwd()
%run import_library.ipynb
%run data_preprocessing.ipynb
%run Evaluation_functions.ipynb
%run categorical_data_encoding.ipynb
%run OneVsRestLightGBMWithCustomizedLoss.ipynb

In [None]:

# Read training data
demograph_train = pd.read_csv("demographics_train.csv")
exposure_train = pd.read_csv("exposure_train.csv")
# Transfer rows containing any missing value are discarded right after loading.
transfers_train = pd.read_csv("transfers_train.csv").dropna()

print(transfers_train.shape)
# Keep the preview as the last expression of the cell so Jupyter renders it;
# a bare `.head()` in the middle of a cell is evaluated and silently thrown away.
demograph_train.head()

In [None]:
# Read testing data
demograph_test = pd.read_csv("demographics_test.csv")
exposure_test = pd.read_csv("exposure_test.csv")
# Transfer rows containing any missing value are discarded right after loading.
transfers_test = pd.read_csv("transfers_test.csv").dropna()

print(transfers_test.shape)
# Keep the preview as the last expression of the cell so Jupyter renders it;
# a bare `.head()` in the middle of a cell is evaluated and silently thrown away.
demograph_test.head()

In [None]:
# Target class and country distributions of the training demographics.
# Each count plot gets its own axes: two consecutive sns.countplot calls without an
# explicit `ax` draw onto the same current axes, so the second would overlay the first.
fig, (ax_level, ax_country) = plt.subplots(1, 2, figsize=(14, 4))

# `x=` is passed by keyword so the call does not depend on seaborn's positional-argument order.
sns.countplot(x=demograph_train["level"], ax=ax_level)
ax_level.set_title("Accounts per level")

sns.countplot(x=demograph_train["COUNTRY_CODE"], ax=ax_country)
ax_country.set_title("Accounts per country")

plt.show()

# Per-class row counts; kept as the last expression so the table is rendered.
demograph_train.groupby("level").count()

In [None]:
#Observations
#The data is heavily skewed towards the US, with only a few data points from other countries
#The classes are highly imbalanced, with the majority classes being levels b, d and c
#Models to test: classification models such as logistic regression, random forest, boosting algorithms and other tree-based algorithms

#Data Cleanup
#1. Demographic data
#a) Age - calculate age from date
#b) Location - cleanup countries and states
#c) Occupation
#d) create column days to verify
#e) Drop rows that have NAs in country code and birth year

# Clean Demographic Train data

# Columns of the demographics table that are one-hot encoded.
demo_onehot_columns = ['COUNTRY_CODE', 'OCC_CAT']

# Training demographics: clean, one-hot encode, index by account id.
demograph_train_cleaned = cleanup_demographics(demograph_train)
demograph_train_cleaned = one_hot_ecoding(demograph_train_cleaned, demo_onehot_columns)
demograph_train_cleaned = demograph_train_cleaned.set_index("EXCHANGE_ACCOUNT_ID")

# Clean Demographic Test data
# Fix: the test frame was previously built from `demograph_train`, so every later
# "test" step silently ran on training demographics.
# NOTE(review): downstream code that assumes a 'level' column in the test data should be
# re-checked against the real test file.
demograph_test_cleaned = cleanup_demographics(demograph_test)
demograph_test_cleaned = one_hot_ecoding(demograph_test_cleaned, demo_onehot_columns)
demograph_test_cleaned = demograph_test_cleaned.set_index("EXCHANGE_ACCOUNT_ID")

# Number of training columns after encoding (last expression so it is displayed).
len(demograph_train_cleaned.columns)

In [None]:
# 2. Exposure data
#    a) one-hot encode the categorical column
#    b) aggregate the encoded exposure rows with exposure_agg
exposure_one_hot_columns = 'cluster_category'

# Step a) for both splits
exposure_train_encoded = one_hot_ecoding(exposure_train, exposure_one_hot_columns)
exposure_test_encoded = one_hot_ecoding(exposure_test, exposure_one_hot_columns)

# Step b) for both splits
exposure_train_agg = exposure_agg(exposure_train_encoded)
exposure_test_agg = exposure_agg(exposure_test_encoded)

In [None]:
# 3. Transfers data
#    a) clean the raw transfers
#    b) one-hot encode the categorical columns
#    c) aggregate the encoded transfer rows with transfer_agg
transfers_feature_cols = ["TX_YEAR", 'CURRENCY', "TYPE"]

# Step a) for both splits
transfers_train_cleaned = cleanup_transfers(transfers_train)
transfers_test_cleaned = cleanup_transfers(transfers_test)

# Step b) for both splits
transfers_train_encoded = one_hot_ecoding(transfers_train_cleaned, transfers_feature_cols)
transfers_test_encoded = one_hot_ecoding(transfers_test_cleaned, transfers_feature_cols)

# Step c) for both splits
transfers_train_agg = transfer_agg(transfers_train_encoded)
transfers_test_agg = transfer_agg(transfers_test_encoded)

In [None]:
# Merge dataframes
# Every join below is index-on-index.
index_join = dict(left_index=True, right_index=True)

# Columns removed from the merged frames before modelling.
col_to_drop = [
    'EXCHANGE_ACCOUNT_ID', 'CREATED_AT', 'FIRST_VERIFIED_AT', 'STATE_CODE',
    'BIRTH_YEAR', 'OCCUPATION', 'AGE_GROUPS', 'DAYS_TO_VERIFY_GROUPS',
    'STATE_CODE_INT', 'COUNTRY_CODE_INT', 'TX_TIME',
]

# --- training split ---
# Outer join keeps accounts that appear in only one of the two tables; the gaps become 0.
Transaction_data_train = pd.merge(
    exposure_train_agg, transfers_train_agg, how='outer', **index_join
).fillna(0)
# Inner join keeps only accounts that have both demographics and transaction rows.
training_data = pd.merge(
    demograph_train_cleaned, Transaction_data_train, how='inner', **index_join
).reset_index()
training_data = training_data.drop(columns=col_to_drop)

# --- test split (same steps) ---
Transaction_data_test = pd.merge(
    exposure_test_agg, transfers_test_agg, how='outer', **index_join
).fillna(0)
test_data = pd.merge(
    demograph_test_cleaned, Transaction_data_test, how='inner', **index_join
).reset_index()
test_data = test_data.drop(columns=col_to_drop)


In [None]:
#MODELS
# Define independent features: every column except the target.
target = ['level']
features = [col for col in training_data.columns if col != 'level']

# Cast all features to int64. `astype` is the vectorised equivalent of the former
# `applymap(np.int64)` (deprecated in recent pandas and applied one element at a time).
training_data[features] = training_data[features].astype(np.int64)

y = training_data[target]
X = training_data[features]
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.20, random_state=42)

# Candidate models. The stochastic ones are seeded so that re-running the cell
# reproduces the same comparison table.
dict_classifiers = {
    "Logreg": LogisticRegression(solver='lbfgs'),
    "NN": KNeighborsClassifier(),
    #"LinearSVM": SVC(probability=True, kernel='linear'), #class_weight='balanced'
    "LGB": lgb.LGBMClassifier(random_state=42),
    "DT": DecisionTreeClassifier(random_state=42),
    "RF": RandomForestClassifier(random_state=42),
    "NB": GaussianNB()
}
classification_metrics = displaymetrics(dict_classifiers, X_train, X_val, y_train, y_val)
classification_metrics

In [None]:
# Baseline random forest on the full feature set.
rf_classifier = RandomForestClassifier(random_state= 42)
rf_classifier.fit(X_train, y_train)
y_val_pred = rf_classifier.predict(X_val)
pred_accuracy_score = accuracy_score(y_val, y_val_pred)
pred_recall_score = recall_score(y_val, y_val_pred, average='macro')
print('Prediction accuracy', pred_accuracy_score,' recall ', pred_recall_score)


cnf_matrix = confusion_matrix(y_val, y_val_pred)

# NOTE(review): plot_confusion_matrix was removed in scikit-learn 1.2; on newer versions
# use ConfusionMatrixDisplay.from_estimator instead.
plot_confusion_matrix(rf_classifier,X_val, y_val)
plt.grid(False)


# Find feature importance through Random forest classifier
features = X_train.columns
importances = rf_classifier.feature_importances_
indices = np.argsort(importances)  # ascending, so the most important features come last

# customized number, clamped so the bar positions always match the number of bars
# even when fewer than 10 features are available
num_features = min(10, len(features))

plt.figure(figsize=(10,5))
plt.title('Feature Importances')

# only plot the customized number of features
plt.barh(range(num_features), importances[indices[-num_features:]], color='b', align='center')
plt.yticks(range(num_features), [features[i] for i in indices[-num_features:]])
plt.xlabel('Relative Importance')
plt.show()
relevant_features = [features[i] for i in indices[-num_features:]]
# Printed explicitly: a bare expression in the middle of a cell is not displayed.
print('Selected features:', relevant_features)

# From here on the train/validation frames only carry the selected features.
X_train = X_train[relevant_features]
X_val = X_val[relevant_features]


# LightGBM on the reduced feature set, seeded like the random forest for reproducibility.
clf_lgb = lgb.LGBMClassifier(random_state=42)
clf_lgb.fit(X_train, y_train)
y_val_pred = clf_lgb.predict(X_val)
pred_accuracy_score = accuracy_score(y_val, y_val_pred)
pred_recall_score = recall_score(y_val, y_val_pred, average='macro')
print('Prediction accuracy', pred_accuracy_score,' recall ', pred_recall_score)


cnf_matrix = confusion_matrix(y_val, y_val_pred)

plot_confusion_matrix(clf_lgb,X_val, y_val)
plt.grid(False)

In [None]:
#To improve the models we try a few techniques on the LGBMClassifier, as it has a better recall and almost the same accuracy as the random forest.
#The data is challenging to classify correctly, mainly because of the imbalanced classes. We try the following techniques to see whether they improve the model:
#Resample classes - One approach to addressing class imbalance is to randomly resample the training dataset. The two main approaches are to delete examples from the majority class (undersampling) and to duplicate examples from the minority class (oversampling). Since we do not want to lose more data, we over-sample the minority class and see whether that improves the model.
#Add class weights - Give the minority classes more weight than the majority classes.
#One-vs-rest classifiers - In this approach we fit one classifier per class; each classifier is fitted against all the other classes. This strategy is good for cases where we need interpretability, as each class is represented by one and only one classifier, which makes it easy to gain knowledge about the class by inspecting its corresponding classifier.
#1. Over-sampling the minority class
print("Training and validation data before sampling: ", X_train.shape, y_train.shape)

# using SMOTE to oversample the minority class
# Seeded so the synthetic samples (and every score derived from them) are reproducible.
smote = SMOTE(sampling_strategy ='minority', k_neighbors =2, random_state=42)
x_sm, y_sm = smote.fit_resample(X_train, y_train)

print("Training and validation data after sampling: ", x_sm.shape, y_sm.shape)

# Using the best model from the group
dict_classifiers = {"LGB": lgb.LGBMClassifier()}
classification_metrics_resampled =  displaymetrics(dict_classifiers, x_sm, X_val,y_sm, y_val)
# display() is needed here: only the last expression of a cell is rendered automatically.
display(classification_metrics_resampled)

##2. Adding class weights
# Trained on the original (not oversampled) training data, with balanced class weights.
dict_classifiers = {"LGB": lgb.LGBMClassifier(class_weight = 'balanced')}
classification_metrics_resampled_weights =  displaymetrics(dict_classifiers, X_train, X_val, y_train, y_val)
display(classification_metrics_resampled_weights)

#3. One-vs-rest classifiers
from sklearn.multiclass import OneVsRestClassifier
ovr = OneVsRestClassifier(lgb.LGBMClassifier( boosting_type='gbdt',  num_leaves=30, max_depth=5, learning_rate=0.1, n_estimators=100, max_bin=225, 
  objective=None, min_split_gain=0, 
 min_child_weight=5, 
 min_child_samples=10, subsample=1, subsample_freq=1, 
colsample_bytree=1, 
reg_alpha=1, reg_lambda=0, seed=410,  silent=True))
   
# Fitting the model with the oversampled training data
ovr.fit(x_sm, y_sm)
   
# Making a prediction on the validation set
prediction = ovr.predict(X_val)
   
# Evaluating the model
print(f"Validation Set Accuracy : {accuracy_score(y_val, prediction) * 100} %\n\n")
print(f"Classification Report : \n\n{classification_report(y_val, prediction)}")

In [None]:
#Hyperparameter Tuning
# n_estimators: number of boosting iterations
# Note: internally, LightGBM constructs num_class * num_iterations trees for multi-class classification problems
# num_leaves: max number of leaves in one tree
# reg_alpha=1 adds L1 regularization, as the model has been overfitting the training samples
# NOTE(review): x_sm was oversampled before cross-validation, so synthetic samples derived
# from a validation-fold row can end up in the training folds; CV scores may be optimistic.
# NOTE(review): nthread=7 inside each fit combined with n_jobs=7 in the search can
# oversubscribe the CPU - confirm against the machine this runs on.

param_set = {
    'n_estimators': [100, 150, 200],
    'num_leaves': range(20, 60, 10),
}

# `multiclass="softmax"` and `num_class=9` were dropped from the estimator: `multiclass` is
# not an LGBMClassifier parameter (it is a value of `objective`), and the scikit-learn wrapper
# derives the objective and the number of classes from the labels when objective=None.
gsearch = GridSearchCV(
    estimator=lgb.LGBMClassifier(
        boosting_type='gbdt', num_leaves=30, max_depth=5, learning_rate=0.1,
        n_estimators=100, max_bin=225, objective=None, min_split_gain=0,
        min_child_weight=5, min_child_samples=10, subsample=1, subsample_freq=1,
        colsample_bytree=1, reg_alpha=1, reg_lambda=0, seed=410, nthread=7, silent=True,
    ),
    param_grid=param_set,
    scoring='f1_macro',
    n_jobs=7,
    cv=10,
)

lgb_model2 = gsearch.fit(x_sm, y_sm)

# save the model to disk
# The context manager guarantees the file is flushed and closed even if pickling fails.
filename = 'finalized_model.sav'
with open(filename, 'wb') as model_file:
    pickle.dump(lgb_model2, model_file)

# Best configuration and its cross-validated macro-F1 (last expression so it is displayed).
lgb_model2.best_estimator_, lgb_model2.best_score_

In [None]:
#Load the saved model and use it to run on the test data provided

# Feature columns of the test data: everything except the target. Filtering instead of
# list.remove() avoids a ValueError when the test data carries no 'level' column.
test_features = [col for col in test_data.columns if col != 'level']

#make sure features are available in the test data
# Selected training features come first, followed by the remaining test columns. Features
# missing from the test data are created by reindex and, like any other NaN, filled with 0.
# This replaces a concat with an empty frame, whose result dtypes are fragile.
all_columns = list(relevant_features) + [col for col in test_data.columns if col not in relevant_features]
test_data_final = test_data.reindex(columns=all_columns).fillna(0)

# load the model from disk
# NOTE: pickle.load can execute arbitrary code; only load model files produced by this notebook.
with open(filename, 'rb') as model_file:
    loaded_model = pickle.load(model_file)

# The training features were cast to int64 before fitting, so the test features get the
# same cast to keep the preprocessing identical.
y_test_pred = loaded_model.predict(test_data_final[relevant_features].astype(np.int64))