In [1]:
import pandas as pd
import os
from time import time
from IPython.display import display # Allows the use of display() for DataFrames
import numpy as np
# Import supplementary visualization code visuals.py
#import visuals as vs

# Pretty display for notebooks
%matplotlib inline

 


### Setting files and directories

In [46]:
path_to_data= "/home/ubuntu/udacity/CodeGladiator/invesco/data"

transaction_file = "Code-Gladiators-Transaction.csv"
#investment_exp_file = "Code-Gladiators-InvestmentExperience.csv"
investment_exp_file = "imputed_investment_exp.csv"
investment_segment = "investment_vehicle_segment.csv"
aum_file = "Code-Gladiators-AUM.csv"
activity_file = "Code-Gladiators-Activity.csv"

test_file = "test_data.csv"



### reading csv files into pandas dataframe

In [47]:
transaction_df = pd.read_csv(os.path.join(path_to_data, transaction_file))
investment_exp_df = pd.read_csv(os.path.join(path_to_data, investment_exp_file))
investment_segment_df = pd.read_csv(os.path.join(path_to_data, investment_segment))
aum_df = pd.read_csv(os.path.join(path_to_data, aum_file))
activity_df = pd.read_csv(os.path.join(path_to_data, activity_file))
test_df = pd.read_csv(os.path.join(path_to_data,test_file))

In [48]:
column_list= [
    'Transaction_Type',
#    'Shares_investor',
#    'AUM_investor',
     'Counts_investor',
#    'Counts_advisor',
#	'Shares_advisor',
#	'AUM_advisor',
#	'Morningstar Category',
#	'Investment',
	'Rating',
	'1 Yr % Rank',
	'3 Yr % Rank',
#	'5 Yr % Rank',
#	'10 Yr % Rank',
	'1 Yr Return',
	'3 Yr Return',
#	'5 Yr Return',
#	'10 Yr Return',
#	'1 Yr Excess Return vs Primary Ix',
#	'3 Yr Excess Return vs Primary Ix',
#	'5 Yr Excess Return vs Primary Ix',
#	'10 Yr Excess Return vs Primary Ix',
#	'1 Yr Excess Return vs Category Ix',
#	'3 Yr Excess Return vs Category Ix',
#	'5 Yr Excess Return vs Category Ix',
#	'10 Yr Excess Return vs Category Ix',
	'Net Flows',
	'Morningstar_Category_Rating',
#	'investment_vehicle_segment',
	'AUM_investor_log',
	'Shares_investor_log',
	'AUM_advisor_log',
	'Shares_advisor_log',
	]


### processing data

Grouping data by unique advisor id and month. Final output will store sums of the assets under managements and shares for each advisor in particular month

In [49]:
grouped_advisor_aum_df = aum_df.groupby(['Unique_Advisor_Id','Month']).agg({'AUM': 'sum','Shares':'sum','Unique_Investment_Id':'count'}).reset_index().rename(columns={'Unique_Investment_Id':'Counts'})
grouped_investment_aum_df = aum_df.groupby(['Unique_Investment_Id','Month']).agg({'AUM': 'sum','Shares':'sum','Unique_Advisor_Id':'count'}).reset_index().rename(columns={'Unique_Advisor_Id':'Counts'})

grouped_investment_aum_df['Year'],grouped_investment_aum_df['Month']=grouped_investment_aum_df['Month'].str.split(' /', 1).str
grouped_advisor_aum_df['Year'],grouped_advisor_aum_df['Month']=grouped_advisor_aum_df['Month'].str.split(' /', 1).str

transaction_df['Year'],transaction_df['Month']=transaction_df['Month'].str.split(' /', 1).str
investment_exp_df['Year'],investment_exp_df['Month']=investment_exp_df['Month'].str.split(' /', 1).str
aum_df['Year'],aum_df['Month']=aum_df['Month'].str.split(' /', 1).str
activity_df['Year'],activity_df['Month']=activity_df['Month'].str.split(' /', 1).str

grouped_investment_aum_df['Mapping_Month']= grouped_investment_aum_df['Month'].astype(int).apply(lambda x : x+1)
grouped_advisor_aum_df['Mapping_Month']= grouped_advisor_aum_df['Month'].astype(int).apply(lambda x : x+1)
investment_exp_df['Mapping_Month']= investment_exp_df['Month'].astype(int).apply(lambda x : x+1)
aum_df['Mapping_Month']= aum_df['Month'].astype(int).apply(lambda x : x+1)
activity_df['Mapping_Month']= activity_df['Month'].astype(int).apply(lambda x : x+1)

transaction_df['Month'] =  transaction_df['Month'].astype(int).apply(lambda x : x+0)

final_transaction = pd.merge(transaction_df, grouped_investment_aum_df, left_on=["Month","Unique_Investment_Id"],right_on=["Mapping_Month","Unique_Investment_Id"], how="left") 
final_transaction = final_transaction.rename(columns={'Month_x': 'Month', 'AUM': 'AUM_investor','Year_x' : 'Year', 'Counts' : 'Counts_investor', 'Shares': 'Shares_investor','Month_y':'Month_actual'})
final_transaction = final_transaction.drop('Year_y', 1)

test_transaction = pd.merge(test_df, grouped_investment_aum_df[grouped_investment_aum_df["Mapping_Month"]== 13], on="Unique_Investment_Id", how="left")
test_transaction = test_transaction.rename(columns={ 'AUM': 'AUM_investor','Counts' : 'Counts_investor', 'Shares': 'Shares_investor'})

final_transaction = pd.merge(final_transaction, grouped_advisor_aum_df, left_on=["Month","Unique_Advisor_Id"],right_on=["Mapping_Month","Unique_Advisor_Id"], how="left") 
final_transaction = final_transaction.rename(columns={'Month_x': 'Month', 'AUM': 'AUM_advisor','Year_x' : 'Year', 'Counts' : 'Counts_advisor', 'Shares': 'Shares_advisor','Mapping_Month_x': 'Mapping_Month'})
final_transaction = final_transaction.drop(['Year_y','Mapping_Month_y','Month_y'], 1)

test_transaction = pd.merge(test_transaction, grouped_advisor_aum_df[grouped_advisor_aum_df["Mapping_Month"]== 13], on="Unique_Advisor_Id", how="left")
test_transaction = test_transaction.rename(columns={ 'AUM': 'AUM_advisor','Counts' : 'Counts_advisor', 'Shares': 'Shares_advisor','Month_x':'Month','Mapping_Month_x':'Mapping_Month'})
test_transaction = test_transaction.drop(['Year_x','Year_y','Mapping_Month_y','Month_y','Mapping_Month'], 1)

investment_exp_df['investment_vehicle_segment']= investment_segment_df['investment_vehicle_segment']
investment_exp_df= investment_exp_df[investment_exp_df['Year']=='2016']

final_transaction_with_exp = pd.merge(final_transaction, investment_exp_df, left_on=["Month","Unique_Investment_Id"],right_on=["Mapping_Month","Unique_Investment_Id"], how="left") 
final_transaction_with_exp = final_transaction_with_exp.rename(columns={'Month_x': 'Month', 'AUM': 'AUM_advisor','Year_x' : 'Year', 'Mapping_Month_x':'Mapping_Month'})
final_transaction_with_exp = final_transaction_with_exp.drop(['Year_y','Mapping_Month_y','Month_y'], 1)

test_transaction_with_exp = pd.merge(test_transaction, investment_exp_df[investment_exp_df["Mapping_Month"]== 13 ], on="Unique_Investment_Id", how="left")

'''
final_transaction_with_exp['AUM_investor_log'] = np.log(final_transaction_with_exp['AUM_investor'])
final_transaction_with_exp['Shares_investor_log'] = np.log(final_transaction_with_exp['Shares_investor'])

final_transaction_with_exp['AUM_advisor_log'] = np.log(final_transaction_with_exp['AUM_advisor'])
final_transaction_with_exp['Shares_advisor_log'] = np.log(final_transaction_with_exp['Shares_advisor'])


test_transaction_with_exp['AUM_investor_log'] = np.log(test_transaction_with_exp['AUM_investor'])
test_transaction_with_exp['Shares_investor_log'] = np.log(test_transaction_with_exp['Shares_investor'])

test_transaction_with_exp['AUM_advisor_log'] = np.log(test_transaction_with_exp['AUM_advisor'])
test_transaction_with_exp['Shares_advisor_log'] = np.log(test_transaction_with_exp['Shares_advisor'])
'''
final_transaction_with_exp['AUM_investor_log'] = np.log(final_transaction_with_exp['AUM_investor']/final_transaction_with_exp['Counts_investor'])
final_transaction_with_exp['Shares_investor_log'] = np.log(final_transaction_with_exp['Shares_investor']/final_transaction_with_exp['Counts_investor'])

final_transaction_with_exp['AUM_advisor_log'] = np.log(final_transaction_with_exp['AUM_advisor']/final_transaction_with_exp['Counts_advisor'])
final_transaction_with_exp['Shares_advisor_log'] = np.log(final_transaction_with_exp['Shares_advisor']/final_transaction_with_exp['Counts_advisor'])


test_transaction_with_exp['AUM_investor_log'] = np.log(test_transaction_with_exp['AUM_investor']/test_transaction_with_exp['Counts_investor'])
test_transaction_with_exp['Shares_investor_log'] = np.log(test_transaction_with_exp['Shares_investor']/test_transaction_with_exp['Counts_investor'])

test_transaction_with_exp['AUM_advisor_log'] = np.log(test_transaction_with_exp['AUM_advisor']/test_transaction_with_exp['Counts_advisor'])
test_transaction_with_exp['Shares_advisor_log'] = np.log(test_transaction_with_exp['Shares_advisor']/test_transaction_with_exp['Counts_advisor'])
final_transaction_with_exp= final_transaction_with_exp.dropna()


required_train_df = final_transaction_with_exp.filter(column_list)

required_train_df['Transaction_Type']= required_train_df.apply(lambda x: 0 if x['Transaction_Type']== 'P' else 1, axis=1)

required_test_df = test_transaction_with_exp.filter(column_list)

required_test_df['Rating'] = required_test_df['Rating'].astype(float)

required_test_df=required_test_df.fillna(required_test_df.median())

# Split the data into features and target label
transaction_type = required_train_df['Transaction_Type']
features_raw = required_train_df.drop('Transaction_Type', axis = 1)
test_raw = required_test_df




In [50]:
from sklearn.preprocessing import LabelEncoder
var_mod = column_list
var_mod.remove('Transaction_Type')
le = LabelEncoder()
for i in var_mod:
    features_raw[i] = le.fit_transform(features_raw[i])
    test_raw[i] = le.fit_transform(test_raw[i])
features_raw.dtypes 

Counts_investor                int64
Rating                         int64
1 Yr % Rank                    int64
3 Yr % Rank                    int64
1 Yr Return                    int64
3 Yr Return                    int64
Net Flows                      int64
Morningstar_Category_Rating    int64
AUM_investor_log               int64
Shares_investor_log            int64
AUM_advisor_log                int64
Shares_advisor_log             int64
dtype: object

In [51]:
# Import train_test_split
from sklearn.cross_validation import train_test_split

# Split the 'features' and 'transaction_type' data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(features_raw, transaction_type, test_size = 0.2, random_state = 0)

# Show the results of the split
print ("Training set has {} samples.".format(X_train.shape[0]))
print ("Testing set has {} samples.".format(X_test.shape[0]))

Training set has 100161 samples.
Testing set has 25041 samples.


In [52]:
# TODO: Import two metrics from sklearn - fbeta_score and accuracy_score
from sklearn.metrics import fbeta_score, accuracy_score
# time.time() return time in seconds since the Epoch
from time import time

beta = 0.5

def train_predict(learner, sample_size, X_train, y_train, X_test, y_test): 
    '''
    inputs:
       - learner: the learning algorithm to be trained and predicted on
       - sample_size: the size of samples (number) to be drawn from training set
       - X_train: features training set
       - y_train: income training set
       - X_test: features testing set
       - y_test: income testing set
    '''
    
    results = {}
    
    # TODO: Fit the learner to the training data using slicing with 'sample_size'
    X_train = X_train[:sample_size]
    y_train = y_train[:sample_size]
    
    start = time() # Get start time
    learner.fit(X_train, y_train)
    end = time() # Get end time
    
    # TODO: Calculate the training time
    results['train_time'] = end - start
        
    # TODO: Get the predictions on the test set,
    #       then get predictions on the first 300 training samples
    start = time() # Get start time
    predictions_test = learner.predict(X_test)
    predictions_train = learner.predict(X_train)
    end = time() # Get end time
    
    # TODO: Calculate the total prediction time
    results['pred_time'] = end-start
            
    # TODO: Compute accuracy on the first 300 training samples
    results['acc_train'] = accuracy_score(y_train, predictions_train)
        
    # TODO: Compute accuracy on test set
    results['acc_test'] = accuracy_score(y_test, predictions_test)
    
    # TODO: Compute F-score on the the first 300 training samples
    results['f_train'] = fbeta_score(y_train, predictions_train, beta=beta)
        
   # TODO: Compute F-score on the test set
    results['f_test'] = fbeta_score(y_test, predictions_test, beta=beta)
       
    # Success
    print ("{} trained on {} samples.".format(learner.__class__.__name__, sample_size))
        
    # Return the results
    return results

In [53]:
# TODO: Import the three supervised learning models from sklearn
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.svm import LinearSVC, SVC
from sklearn.ensemble import AdaBoostClassifier, GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier

# TODO: Initialize the three models
clf_A = LogisticRegression(C=10.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='newton-cg', tol=0.0001,
          verbose=0, warm_start=False)
clf_B = LinearSVC(random_state=101)
clf_C = GaussianNB()
clf_Ada = AdaBoostClassifier()
clf_Grad = GradientBoostingClassifier()
clf_KNN = KNeighborsClassifier()
clf_Dec = DecisionTreeClassifier()
clf_SGD = SGDClassifier()


# TODO: Calculate the number of samples for 1%, 10%, and 100% of the training data
n_train = len(y_train)
samples_1 = int(n_train * 0.01)
samples_10 = int(n_train * 0.1)
samples_100 = n_train

# Collect results on the learners
results = {}
for clf in [clf_A, clf_B, clf_C, clf_Ada, clf_Grad,clf_KNN ,clf_Dec, clf_SGD]:
#for clf in [clf_A]:
    clf_name = clf.__class__.__name__
    results[clf_name] = {}
    for i, samples in enumerate([samples_100]):
        results[clf_name][i] = \
        train_predict(clf, samples, X_train, y_train, X_test, y_test)



LogisticRegression trained on 100161 samples.
LinearSVC trained on 100161 samples.
GaussianNB trained on 100161 samples.
AdaBoostClassifier trained on 100161 samples.
GradientBoostingClassifier trained on 100161 samples.
KNeighborsClassifier trained on 100161 samples.
DecisionTreeClassifier trained on 100161 samples.
SGDClassifier trained on 100161 samples.


In [54]:
display(results)

# Run metrics visualization for the three supervised learning models chosen
#vs.evaluate(results, accuracy, fscore)

{'AdaBoostClassifier': {0: {'acc_test': 0.67732918014456289,
   'acc_train': 0.67626121943670692,
   'f_test': 0.72206473079911981,
   'f_train': 0.72203431607487656,
   'pred_time': 0.7326390743255615,
   'train_time': 3.120283603668213}},
 'DecisionTreeClassifier': {0: {'acc_test': 0.63963100515155147,
   'acc_train': 0.80680105030900251,
   'f_test': 0.71035133965456199,
   'f_train': 0.84902340602564552,
   'pred_time': 0.03689885139465332,
   'train_time': 0.6272737979888916}},
 'GaussianNB': {0: {'acc_test': 0.6622339363443952,
   'acc_train': 0.66161480017172347,
   'f_test': 0.71751339864320496,
   'f_train': 0.71773932186663325,
   'pred_time': 0.024570703506469727,
   'train_time': 0.03567075729370117}},
 'GradientBoostingClassifier': {0: {'acc_test': 0.67968531608162608,
   'acc_train': 0.68132306985752933,
   'f_test': 0.7237094213609242,
   'f_train': 0.72553977825325811,
   'pred_time': 0.16945862770080566,
   'train_time': 6.698094606399536}},
 'KNeighborsClassifier': {0

In [163]:
clf = GradientBoostingClassifier()

In [None]:
# TODO: Import 'GridSearchCV', 'make_scorer', and any other necessary libraries
from sklearn.grid_search import GridSearchCV
from sklearn.metrics import make_scorer

# TODO: Initialize the classifier
clf = LogisticRegression()

# TODO: Create the parameters list you wish to tune
parameters = {'solver': ['newton-cg', 'lbfgs', 'sag'],
              'C': [0.01, 0.1, 1.0, 10.0, 100.0, 1000.0],
              'random_state': [None, 101, 20160101]}

# TODO: Make an fbeta_score scoring object
scorer = make_scorer(fbeta_score, beta=beta)

# TODO: Perform grid search on the classifier using 'scorer' as the scoring method
grid_obj = GridSearchCV(clf, parameters, scoring=scorer)

# TODO: Fit the grid search object to the training data and find the optimal parameters
grid_fit = grid_obj.fit(X_train, y_train)

# Get the estimator
best_clf = grid_fit.best_estimator_

# Make predictions using the unoptimized and model
predictions = (clf.fit(X_train, y_train)).predict(X_test)
best_predictions = best_clf.predict(X_test)

# Report the before-and-afterscores
print ("Unoptimized model\n------")
print ("Accuracy score on testing data: {:.4f}".format(accuracy_score(y_test, predictions)))
print ("F-score on testing data: {:.4f}".format(fbeta_score(y_test, predictions, beta = 0.5)))
print ("\nOptimized Model\n------")
print ("Final accuracy score on the testing data: {:.4f}".format(accuracy_score(y_test, best_predictions)))
print ("Final F-score on the testing data: {:.4f}".format(fbeta_score(y_test, best_predictions, beta = 0.5)))

# show best parameters
print ("\nBest Classifier\n------")
print (best_clf)



In [20]:
from sklearn.externals import joblib

best_clf= clf_Grad
filename = 'logistic_regression_best_invesco.joblib.pkl'

_ = joblib.dump(best_clf, filename, compress=9)

In [21]:
print(best_clf)

clf_loaded = joblib.load(filename)

print(clf_loaded)

GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.1, loss='deviance', max_depth=3,
              max_features=None, max_leaf_nodes=None,
              min_impurity_split=1e-07, min_samples_leaf=1,
              min_samples_split=2, min_weight_fraction_leaf=0.0,
              n_estimators=100, presort='auto', random_state=None,
              subsample=1.0, verbose=0, warm_start=False)
GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.1, loss='deviance', max_depth=3,
              max_features=None, max_leaf_nodes=None,
              min_impurity_split=1e-07, min_samples_leaf=1,
              min_samples_split=2, min_weight_fraction_leaf=0.0,
              n_estimators=100, presort='auto', random_state=None,
              subsample=1.0, verbose=0, warm_start=False)


In [27]:

pred2 = clf_loaded.predict(X_test)
pred2[:5]

array([1, 1, 1, 0, 1])

In [38]:


pred = clf_loaded.predict(test_raw)
pred_prob = clf_loaded.predict_proba(test_raw)



In [42]:
pred[:5]

array([0, 0, 0, 0, 0])

In [40]:
pred_prob[:,1]

array([ 0.80651281,  0.83548749,  0.69778169, ...,  0.78739108,
        0.71228081,  0.67566846])

In [41]:
pred_prob = pd.DataFrame(pred_prob[:,0],columns=["Propensity_Score"])
pred_prob.head()

Unnamed: 0,Propensity_Score
0,0.806513
1,0.835487
2,0.697782
3,0.734473
4,0.769312


In [25]:
pred_df= pd.DataFrame(pred,columns=["Redeem_Status"])

pred_df=pred_df.replace([0,1],['NO','YES'])
pred_df.head()

pred_df['Redeem_Status'].value_counts()

NO     8313
YES     401
Name: Redeem_Status, dtype: int64

In [17]:
result = pd.concat([test_df, pred_prob, pred_df], axis=1)

result.head()


Unnamed: 0,Unique_Advisor_Id,Unique_Investment_Id,Propensity_Score,Redeem_Status
0,1000103,14147,0.073593,NO
1,1000103,3534,0.071945,NO
2,1000103,3651,0.362207,NO
3,1000103,7668,0.079415,NO
4,1000103,9339,0.003015,NO


In [47]:
result.to_csv('test_data_v2-2.csv',index=False)