In [2]:
from random import randint
import numpy as np
from scipy.stats import pearsonr
import pandas as pd
from sklearn.ensemble import RandomForestClassifier 
from sklearn.svm import SVC
import sklearn

## Pre-processing

In [None]:
df = pd.ExcelFile('Campus+Challenge+-+Data+and+Metadata.xlsx')
df

In [None]:
dataframe = df.parse('Data')
dataframe

Each cust_num has data for 12 months - month 1 to month 12. So, 10000 unique customers, and total rows = 12 x 10000 = 120000

In [None]:
#Scoping out the dataset
print('Min month:',dataframe.month.min(),' Max month:',dataframe.month.max())
print('Min balance:',dataframe.normal_tot_bal.min(),'Max balance:',dataframe.normal_tot_bal.max())

In [None]:
# Sort by cust_num, month. This will help us in looking for patterns
sorted_dataframe = dataframe.sort_values(by=['cust_num','month'],ascending=True)
sorted_dataframe = sorted_dataframe.reset_index()
sorted_dataframe.drop('index',1)

###### Creating the 'change' column that tracks changes in total balance

In [None]:
#Read Balance subset data as first step towards creating the new 'change' column
balance_subset = pd.read_csv('balance_subset.csv')
change_dataframe = balance_subset[['cust_num','change_1','change_2','change_3','change_4','change_5','change_6','change_7','change_8',
               'change_9','change_10','change_11']]

In [None]:
pd.DataFrame(sorted_dataframe, columns=['cust_num','normal_tot_bal'])

In [None]:
sorted_dataframe['change']=0.0

In [None]:
list_ = []
for i in range(len(sorted_dataframe)):
    if sorted_dataframe['month'][i]==12:
        list_.append(0.0)
        
    else:
        list_.append(sorted_dataframe['normal_tot_bal'][i+1]-sorted_dataframe['normal_tot_bal'][i])

In [None]:
sorted_dataframe['change'] = pd.Series(list_)

In [None]:
pd.DataFrame(sorted_dataframe, columns=['cust_num','normal_tot_bal','change'])

###### Discretising the 'change' column to give 1 for an increase and 0 for a decrease

In [None]:
# sorted_dataframe['step_balance'] = 0
step_list = []
for i in range(len(sorted_dataframe)):
    if sorted_dataframe['change'][i] > 0:
        step_list.append(1)
    else:
        step_list.append(0)
sorted_dataframe['step_balance'] = pd.Series(step_list)        

In [None]:
np.random.seed(0)
size=120000
x = np.random.normal(0,10000, size)
sorted_dataframe['random']=pd.Series(x)

# Run prediction models on entire dataset

In [5]:
#Loading the processed data set into a pandas dataframe
dataframe_entire = pd.read_csv('clean_data1.csv')

#separating the set into training and testing sets 
growth_train,growth_dev = dataframe_entire[:10000],dataframe_entire[10000:12000]
train_X = growth_train[['wf_outreach_flag_chan_ii','wf_outreach_flag_chan_ii','cust_outreach_aii','cust_outreach_ai','typeA_ct','cust_outreach_avi','wf_outreach_flag_chan_iv','wf_outreach_flag_chan_i']]
test_X = growth_train['step_balance']
train_Y = growth_dev[['wf_outreach_flag_chan_ii','wf_outreach_flag_chan_ii','cust_outreach_aii','cust_outreach_ai','typeA_ct','cust_outreach_avi','wf_outreach_flag_chan_iv','wf_outreach_flag_chan_i']]
test_Y  = growth_dev['step_balance']

###### Running the model for the entire training set 

In [8]:
forest = RandomForestClassifier(n_estimators = 100)
forest = forest.fit(train_X,test_X)
print ("The accuracy of the model is: " + str(forest.score(train_Y, test_Y)))

The accuracy of the model is: 0.558


### Working with the "customer_demographic_ai = 1, customer_demographic_aii = 1" sub-dataset

In [None]:
dataframe = pd.read_csv('1_1.csv')

###### Locating the features most correlated with the output for this sub-dataset, using the Pearson Coefficient method

In [None]:
for i in dataframe.columns:
    corr = pearsonr(dataframe[i], dataframe['step_balance'])
    print("Correlation" + " " + i + "=" + str(corr))

#### Once we locate the top 5-10 features, we utilize training algorithms like Random Forest, Recurrent Neural Networks and SVM to train a model with the said features. We found that Random Forest gives us the best prediction accuracies, and thus focussed on this method.

In [None]:
#We use a training set and a test set. We also utilised different k-fold cross-validation techniques to reduce biases.
growth_train = dataframe[:7000]
growth_dev = dataframe[7000:7715]


In [None]:
train_X = growth_train[['typeA_ct','typeA_bal_cat','wf_outreach_flag_chan_ii','typeF_flag','typeB_bal_cat','typeB_ct','cust_outreach_avi','wf_outreach_flag_chan_i','cust_outreach_ai','cust_outreach_av','cust_outreach_aii','wf_outreach_flag_chan_iv']]

In [None]:
test_X = growth_train['step_balance']

In [None]:
train_Y = growth_dev[['typeA_ct','typeA_bal_cat','wf_outreach_flag_chan_ii','typeF_flag','typeB_bal_cat','typeB_ct','cust_outreach_avi','wf_outreach_flag_chan_i','cust_outreach_ai','cust_outreach_av','cust_outreach_aii','wf_outreach_flag_chan_iv']]
test_Y  = growth_dev['step_balance']

#### Implementation of Random Forest

In [None]:
forest = RandomForestClassifier(n_estimators = 100)

# Fit the training data to the Survived labels and create the decision trees
forest = forest.fit(train_X,test_X)

# Take the same decision trees and run it on the test data
output = forest.predict(train_Y)
forest.score(list(output), list(test_Y))

    

###### Implementation of SVM technique using rbf kernel as well as linear kernel

In [None]:
clf = SVC(kernel='rbf')
clf.fit(train_X,test_X)
clf.score(train_Y, test_Y)

In [None]:
clf1 = SVC(kernel='linear')
clf1.fit(train_X,test_X)
clf1.score(train_Y, test_Y)