In [2]:
import pandas as pd
import os
from time import time
from IPython.display import display # Allows the use of display() for DataFrames

# Import supplementary visualization code visuals.py
#import visuals as vs

# Pretty display for notebooks
%matplotlib inline

 


### Setting files and directories

In [4]:
path_to_data= "/home/ubuntu/udacity/CodeGladiator/invesco/data"

transaction_file = "Code-Gladiators-Transaction.csv"
investment_exp_file = "Code-Gladiators-InvestmentExperience.csv"
aum_file = "Code-Gladiators-AUM.csv"
activity_file = "Code-Gladiators-Activity.csv"

test_file = "test_data.csv"


### reading csv files into pandas dataframe

In [5]:
transaction_df = pd.read_csv(os.path.join(path_to_data, transaction_file))
investment_exp_df = pd.read_csv(os.path.join(path_to_data, investment_exp_file))
aum_df = pd.read_csv(os.path.join(path_to_data, aum_file))
activity_df = pd.read_csv(os.path.join(path_to_data, activity_file))
test_df = pd.read_csv(os.path.join(path_to_data,test_file))

In [None]:
test_df.head()

### processing data

Grouping data by unique advisor id and month. Final output will store sums of the assets under managements and shares for each advisor in particular month

In [6]:
grouped_advisor_aum_df = aum_df.groupby(['Unique_Advisor_Id','Month']).agg({'AUM': 'sum','Shares':'sum','Unique_Investment_Id':'count'}).reset_index().rename(columns={'Unique_Investment_Id':'Counts'})
grouped_advisor_aum_df.head()


Unnamed: 0,Unique_Advisor_Id,Month,Counts,Shares,AUM
0,12243,2016 / 01,44,109673.6785,1461389.0
1,12243,2016 / 02,44,106252.324833,1414581.0
2,12243,2016 / 03,44,103253.896694,1426161.0
3,12243,2016 / 04,44,100449.798917,1414247.0
4,12243,2016 / 05,44,91960.177528,1241367.0


Grouping data by unique investment vehicle id and month. Final output will store sums of the assets under managements and shares for each investment vehicle in particular month

In [5]:
grouped_investment_aum_df = aum_df.groupby(['Unique_Investment_Id','Month']).agg({'AUM': 'sum','Shares':'sum','Unique_Advisor_Id':'count'}).reset_index().rename(columns={'Unique_Advisor_Id':'Counts'})
grouped_investment_aum_df[grouped_investment_aum_df['Unique_Investment_Id'] == 3425].head()

Unnamed: 0,Unique_Investment_Id,Month,Counts,Shares,AUM
656,3425,2016 / 01,36,28135.758333,515757.171333
657,3425,2016 / 02,36,28135.758333,509040.019833
658,3425,2016 / 03,36,27256.321667,540765.4215
659,3425,2016 / 04,39,26081.948333,532694.127833
660,3425,2016 / 05,38,12041.815,247218.4625


Splitting Month column to take out month only for help in joining with different dataframes

In [6]:
grouped_investment_aum_df['Year'],grouped_investment_aum_df['Month']=grouped_investment_aum_df['Month'].str.split(' /', 1).str
grouped_advisor_aum_df['Year'],grouped_advisor_aum_df['Month']=grouped_advisor_aum_df['Month'].str.split(' /', 1).str

transaction_df['Year'],transaction_df['Month']=transaction_df['Month'].str.split(' /', 1).str
investment_exp_df['Year'],investment_exp_df['Month']=investment_exp_df['Month'].str.split(' /', 1).str
aum_df['Year'],aum_df['Month']=aum_df['Month'].str.split(' /', 1).str
activity_df['Year'],activity_df['Month']=activity_df['Month'].str.split(' /', 1).str

In [None]:
transaction_df.head()

Incerementing each Month with 1 so corresponding row can be mapped with transactions. Current month transaction is result of previous month experience/AUM/Shares

In [7]:
grouped_investment_aum_df['Mapping_Month']= grouped_investment_aum_df['Month'].astype(int).apply(lambda x : x+1)
grouped_advisor_aum_df['Mapping_Month']= grouped_advisor_aum_df['Month'].astype(int).apply(lambda x : x+1)
investment_exp_df['Mapping_Month']= investment_exp_df['Month'].astype(int).apply(lambda x : x+1)
aum_df['Mapping_Month']= aum_df['Month'].astype(int).apply(lambda x : x+1)
activity_df['Mapping_Month']= activity_df['Month'].astype(int).apply(lambda x : x+1)


grouped_investment_aum_df.head()


Unnamed: 0,Unique_Investment_Id,Month,Counts,Shares,AUM,Year,Mapping_Month
0,74,1,3,376166.733333,376166.733333,2016,2
1,74,2,3,376166.733333,376166.733333,2016,3
2,74,3,3,376172.733333,376172.733333,2016,4
3,74,4,3,376175.733333,376175.733333,2016,5
4,74,5,3,376178.833333,376178.833333,2016,6


Converting month in transaction history to integer to ease in join operations

In [8]:
transaction_df['Month'] =  transaction_df['Month'].astype(int).apply(lambda x : x+0)

transaction_df.head()

Unnamed: 0,Unique_Advisor_Id,Unique_Investment_Id,Month,Transaction_Type,Code_1,Code_2,Code_3,Code_4,Code_5,Amount,Year
0,1000103,20058,1,P,1,48,1,7,4,4678.666667,2016
1,1000103,20058,1,P,1,48,1,7,4,353.066667,2016
2,1000103,20058,1,P,1,48,1,7,4,1809.626667,2016
3,1000103,20058,2,P,1,48,1,7,4,4678.666667,2016
4,1000103,20058,2,P,1,48,1,7,4,353.066667,2016


In [9]:
final_transaction = pd.merge(transaction_df, grouped_investment_aum_df, left_on=["Month","Unique_Investment_Id"],right_on=["Mapping_Month","Unique_Investment_Id"], how="left") 
final_transaction = final_transaction.rename(columns={'Month_x': 'Month', 'AUM': 'AUM_investor','Year_x' : 'Year', 'Counts' : 'Counts_investor', 'Shares': 'Shares_investor','Month_y':'Month_actual'})
final_transaction = final_transaction.drop('Year_y', 1)
final_transaction.head()

Unnamed: 0,Unique_Advisor_Id,Unique_Investment_Id,Month,Transaction_Type,Code_1,Code_2,Code_3,Code_4,Code_5,Amount,Year,Month_actual,Counts_investor,Shares_investor,AUM_investor,Mapping_Month
0,1000103,20058,1,P,1,48,1,7,4,4678.666667,2016,,,,,
1,1000103,20058,1,P,1,48,1,7,4,353.066667,2016,,,,,
2,1000103,20058,1,P,1,48,1,7,4,1809.626667,2016,,,,,
3,1000103,20058,2,P,1,48,1,7,4,4678.666667,2016,1.0,2.0,132979.907,1339674.0,2.0
4,1000103,20058,2,P,1,48,1,7,4,353.066667,2016,1.0,2.0,132979.907,1339674.0,2.0


In [10]:
test_transaction = pd.merge(test_df, grouped_investment_aum_df[grouped_investment_aum_df["Mapping_Month"]== 13], on="Unique_Investment_Id", how="left")
test_transaction = test_transaction.rename(columns={ 'AUM': 'AUM_investor','Counts' : 'Counts_investor', 'Shares': 'Shares_investor'})

test_transaction.shape


(8714, 8)

In [11]:
final_transaction = pd.merge(final_transaction, grouped_advisor_aum_df, left_on=["Month","Unique_Advisor_Id"],right_on=["Mapping_Month","Unique_Advisor_Id"], how="left") 
final_transaction = final_transaction.rename(columns={'Month_x': 'Month', 'AUM': 'AUM_advisor','Year_x' : 'Year', 'Counts' : 'Counts_advisor', 'Shares': 'Shares_advisor','Mapping_Month_x': 'Mapping_Month'})
final_transaction = final_transaction.drop(['Year_y','Mapping_Month_y','Month_y'], 1)

final_transaction.head()

Unnamed: 0,Unique_Advisor_Id,Unique_Investment_Id,Month,Transaction_Type,Code_1,Code_2,Code_3,Code_4,Code_5,Amount,Year,Month_actual,Counts_investor,Shares_investor,AUM_investor,Mapping_Month,Counts_advisor,Shares_advisor,AUM_advisor
0,1000103,20058,1,P,1,48,1,7,4,4678.666667,2016,,,,,,,,
1,1000103,20058,1,P,1,48,1,7,4,353.066667,2016,,,,,,,,
2,1000103,20058,1,P,1,48,1,7,4,1809.626667,2016,,,,,,,,
3,1000103,20058,2,P,1,48,1,7,4,4678.666667,2016,1.0,2.0,132979.907,1339674.0,2.0,6.0,13536.409,245350.665667
4,1000103,20058,2,P,1,48,1,7,4,353.066667,2016,1.0,2.0,132979.907,1339674.0,2.0,6.0,13536.409,245350.665667


In [12]:
test_transaction = pd.merge(test_transaction, grouped_advisor_aum_df[grouped_advisor_aum_df["Mapping_Month"]== 13], on="Unique_Advisor_Id", how="left")

test_transaction = test_transaction.rename(columns={ 'AUM': 'AUM_advisor','Counts' : 'Counts_advisor', 'Shares': 'Shares_advisor','Month_x':'Month','Mapping_Month_x':'Mapping_Month'})
test_transaction = test_transaction.drop(['Year_x','Year_y','Mapping_Month_y','Month_y','Mapping_Month'], 1)

test_transaction.shape

(8714, 9)

In [13]:
print(investment_exp_df.shape)
investment_exp_df= investment_exp_df[investment_exp_df['Year']=='2016']
investment_exp_df.shape


(14208, 24)


(7104, 24)

In [14]:
final_transaction_with_exp = pd.merge(final_transaction, investment_exp_df, left_on=["Month","Unique_Investment_Id"],right_on=["Mapping_Month","Unique_Investment_Id"], how="left") 
final_transaction_with_exp = final_transaction_with_exp.rename(columns={'Month_x': 'Month', 'AUM': 'AUM_advisor','Year_x' : 'Year', 'Mapping_Month_x':'Mapping_Month'})
final_transaction_with_exp = final_transaction_with_exp.drop(['Year_y','Mapping_Month_y','Month_y'], 1)

final_transaction_with_exp.head()

Unnamed: 0,Unique_Advisor_Id,Unique_Investment_Id,Month,Transaction_Type,Code_1,Code_2,Code_3,Code_4,Code_5,Amount,...,10 Yr Return,1 Yr Excess Return vs Primary Ix,3 Yr Excess Return vs Primary Ix,5 Yr Excess Return vs Primary Ix,10 Yr Excess Return vs Primary Ix,1 Yr Excess Return vs Category Ix,3 Yr Excess Return vs Category Ix,5 Yr Excess Return vs Category Ix,10 Yr Excess Return vs Category Ix,Net Flows
0,1000103,20058,1,P,1,48,1,7,4,4678.666667,...,,,,,,,,,,
1,1000103,20058,1,P,1,48,1,7,4,353.066667,...,,,,,,,,,,
2,1000103,20058,1,P,1,48,1,7,4,1809.626667,...,,,,,,,,,,
3,1000103,20058,2,P,1,48,1,7,4,4678.666667,...,,-15.184451,-9.429957,,,-10.850458,-6.919882,,,-1618909000.0
4,1000103,20058,2,P,1,48,1,7,4,353.066667,...,,-15.184451,-9.429957,,,-10.850458,-6.919882,,,-1618909000.0


In [15]:
final_transaction_with_exp.columns.values.tolist()



['Unique_Advisor_Id',
 'Unique_Investment_Id',
 'Month',
 'Transaction_Type',
 'Code_1',
 'Code_2',
 'Code_3',
 'Code_4',
 'Code_5',
 'Amount',
 'Year',
 'Month_actual',
 'Counts_investor',
 'Shares_investor',
 'AUM_investor',
 'Mapping_Month',
 'Counts_advisor',
 'Shares_advisor',
 'AUM_advisor',
 'Morningstar Category',
 'Investment',
 'Rating',
 '1 Yr % Rank',
 '3 Yr % Rank',
 '5 Yr % Rank',
 '10 Yr % Rank',
 '1 Yr Return',
 '3 Yr Return',
 '5 Yr Return',
 '10 Yr Return',
 '1 Yr Excess Return vs Primary Ix',
 '3 Yr Excess Return vs Primary Ix',
 '5 Yr Excess Return vs Primary Ix',
 '10 Yr Excess Return vs Primary Ix',
 '1 Yr Excess Return vs Category Ix',
 '3 Yr Excess Return vs Category Ix',
 '5 Yr Excess Return vs Category Ix',
 '10 Yr Excess Return vs Category Ix',
 'Net Flows']

In [16]:
test_transaction_with_exp = pd.merge(test_transaction, investment_exp_df[investment_exp_df["Mapping_Month"]== 13 ], on="Unique_Investment_Id", how="left")

test_transaction_with_exp.shape


(8714, 32)

In [17]:
test_transaction_with_exp.columns.values.tolist()

['Unique_Advisor_Id',
 'Unique_Investment_Id',
 'Month_x',
 'Counts_investor',
 'Shares_investor',
 'AUM_investor',
 'Counts_advisor',
 'Shares_advisor',
 'AUM_advisor',
 'Morningstar Category',
 'Month_y',
 'Investment',
 'Rating',
 '1 Yr % Rank',
 '3 Yr % Rank',
 '5 Yr % Rank',
 '10 Yr % Rank',
 '1 Yr Return',
 '3 Yr Return',
 '5 Yr Return',
 '10 Yr Return',
 '1 Yr Excess Return vs Primary Ix',
 '3 Yr Excess Return vs Primary Ix',
 '5 Yr Excess Return vs Primary Ix',
 '10 Yr Excess Return vs Primary Ix',
 '1 Yr Excess Return vs Category Ix',
 '3 Yr Excess Return vs Category Ix',
 '5 Yr Excess Return vs Category Ix',
 '10 Yr Excess Return vs Category Ix',
 'Net Flows',
 'Year',
 'Mapping_Month']

In [18]:
required_train_df = final_transaction_with_exp.filter(['AUM_investor','Counts_investor','Shares_investor','AUM_advisor','Shares_advisor','Rating','1 Yr % Rank','3 Yr % Rank','1 Yr Return','3 Yr Return','1 Yr Excess Return vs Primary Ix','3 Yr Excess Return vs Primary Ix','1 Yr Excess Return vs Category Ix','3 Yr Excess Return vs Category Ix','Net Flows','Transaction_Type'])


print(required_train_df[required_train_df['AUM_investor'].isnull()])



        AUM_investor  Counts_investor  Shares_investor    AUM_advisor  \
0                NaN              NaN              NaN            NaN   
1                NaN              NaN              NaN            NaN   
2                NaN              NaN              NaN            NaN   
58               NaN              NaN              NaN            NaN   
59               NaN              NaN              NaN            NaN   
60               NaN              NaN              NaN            NaN   
61               NaN              NaN              NaN            NaN   
62               NaN              NaN              NaN            NaN   
94               NaN              NaN              NaN            NaN   
95               NaN              NaN              NaN            NaN   
96               NaN              NaN              NaN            NaN   
132              NaN              NaN              NaN            NaN   
133              NaN              NaN              

In [19]:
required_test_df = test_transaction_with_exp.filter(['AUM_investor','Counts_investor','Shares_investor','AUM_advisor','Shares_advisor','Rating','1 Yr % Rank','3 Yr % Rank','1 Yr Return','3 Yr Return','1 Yr Excess Return vs Primary Ix','3 Yr Excess Return vs Primary Ix','1 Yr Excess Return vs Category Ix','3 Yr Excess Return vs Category Ix','Net Flows'])


print(required_test_df[required_test_df['AUM_investor'].isnull()])

required_test_df=required_test_df.fillna(0)
required_test_df.shape

      AUM_investor  Counts_investor  Shares_investor   AUM_advisor  \
9              NaN              NaN              NaN           NaN   
107            NaN              NaN              NaN  7.324585e+06   
125            NaN              NaN              NaN  1.045836e+06   
128            NaN              NaN              NaN  1.045836e+06   
279            NaN              NaN              NaN  2.193472e+05   
280            NaN              NaN              NaN  2.193472e+05   
281            NaN              NaN              NaN  2.193472e+05   
289            NaN              NaN              NaN           NaN   
421            NaN              NaN              NaN           NaN   
422            NaN              NaN              NaN           NaN   
425            NaN              NaN              NaN           NaN   
427            NaN              NaN              NaN  4.026281e+06   
428            NaN              NaN              NaN  4.026281e+06   
429            NaN  

(8714, 15)

In [20]:

print(required_test_df[required_test_df['AUM_investor'].isnull()])

Empty DataFrame
Columns: [AUM_investor, Counts_investor, Shares_investor, AUM_advisor, Shares_advisor, Rating, 1 Yr % Rank, 3 Yr % Rank, 1 Yr Return, 3 Yr Return, 1 Yr Excess Return vs Primary Ix, 3 Yr Excess Return vs Primary Ix, 1 Yr Excess Return vs Category Ix, 3 Yr Excess Return vs Category Ix, Net Flows]
Index: []


In [21]:
print(required_train_df.shape)

required_train_df= required_train_df.fillna(0)
#required_train_df=required_train_df.dropna()
print(required_train_df.shape)

required_train_df.head()

(163722, 16)
(163722, 16)


Unnamed: 0,AUM_investor,Counts_investor,Shares_investor,AUM_advisor,Shares_advisor,Rating,1 Yr % Rank,3 Yr % Rank,1 Yr Return,3 Yr Return,1 Yr Excess Return vs Primary Ix,3 Yr Excess Return vs Primary Ix,1 Yr Excess Return vs Category Ix,3 Yr Excess Return vs Category Ix,Net Flows,Transaction_Type
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,P
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,P
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,P
3,1339674.0,2.0,132979.907,245350.665667,13536.409,1.0,99.0,98.0,-15.84976,1.871172,-15.184451,-9.429957,-10.850458,-6.919882,-1618909000.0,P
4,1339674.0,2.0,132979.907,245350.665667,13536.409,1.0,99.0,98.0,-15.84976,1.871172,-15.184451,-9.429957,-10.850458,-6.919882,-1618909000.0,P


In [22]:
print(required_test_df.shape)

required_test_df=required_test_df.dropna()
print(required_test_df.shape)

required_test_df.head()

(8714, 15)
(8714, 15)


Unnamed: 0,AUM_investor,Counts_investor,Shares_investor,AUM_advisor,Shares_advisor,Rating,1 Yr % Rank,3 Yr % Rank,1 Yr Return,3 Yr Return,1 Yr Excess Return vs Primary Ix,3 Yr Excess Return vs Primary Ix,1 Yr Excess Return vs Category Ix,3 Yr Excess Return vs Category Ix,Net Flows
0,129215000.0,621.0,6688148.0,202376.243875,23488.55675,5.0,47.0,6.0,14.60528,9.499912,2.645357,0.627968,-2.735172,0.914585,-2289336000.0
1,4000243.0,32.0,127255.5,202376.243875,23488.55675,3.0,21.0,67.0,17.536525,6.275912,0.196073,-2.309415,0.196073,-2.309415,-2289336000.0
2,2017696.0,16.0,67512.48,202376.243875,23488.55675,3.0,43.0,42.0,-1.12074,-1.447993,-5.61586,0.327666,-1.24552,-0.182217,-1486395000.0
3,35384020.0,170.0,2638633.0,202376.243875,23488.55675,5.0,2.0,3.0,8.946499,2.905355,9.349074,6.076022,9.349074,6.076022,-521750500.0
4,1498600.0,33.0,37259.22,202376.243875,23488.55675,4.0,46.0,31.0,11.020021,5.298992,-0.939901,-3.572952,-0.298074,0.247737,-900259100.0


In [23]:
# Split the data into features and target label
transaction_type = required_train_df['Transaction_Type']
features_raw = required_train_df.drop('Transaction_Type', axis = 1)
test_raw = required_test_df

In [24]:
# Import sklearn.preprocessing.StandardScaler
from sklearn.preprocessing import MinMaxScaler

# Initialize a scaler, then apply it to the features
scaler = MinMaxScaler()
numerical = ['AUM_investor','Counts_investor','Shares_investor','AUM_advisor','Shares_advisor','1 Yr % Rank','3 Yr % Rank','1 Yr Return','3 Yr Return','1 Yr Excess Return vs Primary Ix','3 Yr Excess Return vs Primary Ix','1 Yr Excess Return vs Category Ix','3 Yr Excess Return vs Category Ix','Net Flows']
features_raw[numerical] = scaler.fit_transform(required_train_df[numerical])

test_raw[numerical] = scaler.fit_transform(required_test_df[numerical])
# Show an example of a record with scaling applied
display(features_raw.head(n = 5))

Unnamed: 0,AUM_investor,Counts_investor,Shares_investor,AUM_advisor,Shares_advisor,Rating,1 Yr % Rank,3 Yr % Rank,1 Yr Return,3 Yr Return,1 Yr Excess Return vs Primary Ix,3 Yr Excess Return vs Primary Ix,1 Yr Excess Return vs Category Ix,3 Yr Excess Return vs Category Ix,Net Flows
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.470239,0.58898,0.633898,0.690192,0.544173,0.511896,0.241232
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.470239,0.58898,0.633898,0.690192,0.544173,0.511896,0.241232
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.470239,0.58898,0.633898,0.690192,0.544173,0.511896,0.241232
3,0.002258,0.002099,0.028302,0.004533,0.004006,1.0,1.0,0.989899,0.094676,0.655548,0.151229,0.065839,0.230261,0.10167,0.150576
4,0.002258,0.002099,0.028302,0.004533,0.004006,1.0,1.0,0.989899,0.094676,0.655548,0.151229,0.065839,0.230261,0.10167,0.150576


In [None]:
# Produce a scatter matrix for each pair of features in the data

pd.scatter_matrix(features_raw, alpha = 0.3, figsize = (14,8), diagonal = 'kde');

In [25]:
# TODO: One-hot encode the 'features_raw' data using pandas.get_dummies()
features_raw.Rating = features_raw.Rating.astype(str)
test_raw.Rating = test_raw.Rating.astype(str)

features = pd.get_dummies(features_raw)
test = pd.get_dummies(test_raw)

# TODO: Encode the 'income_raw' data to numerical values
transaction_type = transaction_type.replace(['P','R'],[0,1])

# Print the number of features after one-hot encoding
encoded_train = list(features.columns)
encoded_test = list(test.columns)
print ("{} total features after one-hot encoding.".format(len(encoded_train)))
print ("{} total test features after one-hot encoding.".format(len(encoded_test)))


#Uncomment the following line to see the encoded feature names
#print (encoded)

20 total features after one-hot encoding.
20 total test features after one-hot encoding.


NameError: name 'encoded' is not defined

In [26]:
# Import train_test_split
from sklearn.cross_validation import train_test_split

# Split the 'features' and 'transaction_type' data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(features, transaction_type, test_size = 0.2, random_state = 0)

# Show the results of the split
print ("Training set has {} samples.".format(X_train.shape[0]))
print ("Testing set has {} samples.".format(X_test.shape[0]))

Training set has 130977 samples.
Testing set has 32745 samples.




In [27]:
# TODO: Import two metrics from sklearn - fbeta_score and accuracy_score
from sklearn.metrics import fbeta_score, accuracy_score
# time.time() return time in seconds since the Epoch
from time import time

beta = 0.5

def train_predict(learner, sample_size, X_train, y_train, X_test, y_test): 
    '''
    inputs:
       - learner: the learning algorithm to be trained and predicted on
       - sample_size: the size of samples (number) to be drawn from training set
       - X_train: features training set
       - y_train: income training set
       - X_test: features testing set
       - y_test: income testing set
    '''
    
    results = {}
    
    # TODO: Fit the learner to the training data using slicing with 'sample_size'
    X_train = X_train[:sample_size]
    y_train = y_train[:sample_size]
    
    start = time() # Get start time
    learner.fit(X_train, y_train)
    end = time() # Get end time
    
    # TODO: Calculate the training time
    results['train_time'] = end - start
        
    # TODO: Get the predictions on the test set,
    #       then get predictions on the first 300 training samples
    start = time() # Get start time
    predictions_test = learner.predict(X_test)
    predictions_train = learner.predict(X_train)
    end = time() # Get end time
    
    # TODO: Calculate the total prediction time
    results['pred_time'] = end-start
            
    # TODO: Compute accuracy on the first 300 training samples
    results['acc_train'] = accuracy_score(y_train, predictions_train)
        
    # TODO: Compute accuracy on test set
    results['acc_test'] = accuracy_score(y_test, predictions_test)
    
    # TODO: Compute F-score on the the first 300 training samples
    results['f_train'] = fbeta_score(y_train, predictions_train, beta=beta)
        
   # TODO: Compute F-score on the test set
    results['f_test'] = fbeta_score(y_test, predictions_test, beta=beta)
       
    # Success
    print ("{} trained on {} samples.".format(learner.__class__.__name__, sample_size))
        
    # Return the results
    return results

In [28]:
# TODO: Import the three supervised learning models from sklearn
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.svm import LinearSVC, SVC
from sklearn.ensemble import AdaBoostClassifier, GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier

# TODO: Initialize the three models
clf_A = LogisticRegression(random_state=101)
clf_B = LinearSVC(random_state=101)
clf_C = GaussianNB()
clf_Ada = AdaBoostClassifier()
clf_Grad = GradientBoostingClassifier()
clf_KNN = KNeighborsClassifier()
clf_Dec = DecisionTreeClassifier()
clf_SGD = SGDClassifier()

# TODO: Calculate the number of samples for 1%, 10%, and 100% of the training data
n_train = len(y_train)
samples_1 = int(n_train * 0.01)
samples_10 = int(n_train * 0.1)
samples_100 = n_train

# Collect results on the learners
results = {}
for clf in [clf_A, clf_B, clf_C, clf_Ada, clf_Grad,clf_KNN ,clf_Dec, clf_SGD]:
    clf_name = clf.__class__.__name__
    results[clf_name] = {}
    for i, samples in enumerate([samples_1, samples_10, samples_100]):
        results[clf_name][i] = \
        train_predict(clf, samples, X_train, y_train, X_test, y_test)

LogisticRegression trained on 1309 samples.
LogisticRegression trained on 13097 samples.
LogisticRegression trained on 130977 samples.
LinearSVC trained on 1309 samples.
LinearSVC trained on 13097 samples.
LinearSVC trained on 130977 samples.
GaussianNB trained on 1309 samples.
GaussianNB trained on 13097 samples.
GaussianNB trained on 130977 samples.
AdaBoostClassifier trained on 1309 samples.
AdaBoostClassifier trained on 13097 samples.
AdaBoostClassifier trained on 130977 samples.
GradientBoostingClassifier trained on 1309 samples.
GradientBoostingClassifier trained on 13097 samples.
GradientBoostingClassifier trained on 130977 samples.
KNeighborsClassifier trained on 1309 samples.
KNeighborsClassifier trained on 13097 samples.
KNeighborsClassifier trained on 130977 samples.
DecisionTreeClassifier trained on 1309 samples.
DecisionTreeClassifier trained on 13097 samples.
DecisionTreeClassifier trained on 130977 samples.
SGDClassifier trained on 1309 samples.
SGDClassifier trained on 

  'precision', 'predicted', average, warn_for)


SGDClassifier trained on 130977 samples.


In [29]:
display(results)

# Run metrics visualization for the three supervised learning models chosen
#vs.evaluate(results, accuracy, fscore)

{'AdaBoostClassifier': {0: {'acc_test': 0.64309054817529399,
   'acc_train': 0.69442322383498856,
   'f_test': 0.69009822403016174,
   'f_train': 0.73468391505979991,
   'pred_time': 0.22156953811645508,
   'train_time': 0.12124085426330566},
  1: {'acc_test': 0.65738280653534886,
   'acc_train': 0.65839505230205386,
   'f_test': 0.70308239238895509,
   'f_train': 0.70553625320179558,
   'pred_time': 0.2965843677520752,
   'train_time': 0.5187268257141113},
  2: {'acc_test': 0.65781035272560695,
   'acc_train': 0.65746657810149878,
   'f_test': 0.70268891423083846,
   'f_train': 0.70230032415604826,
   'pred_time': 1.0294520854949951,
   'train_time': 4.534843444824219}},
 'DecisionTreeClassifier': {0: {'acc_test': 0.57660711559016642,
   'acc_train': 0.9327731092436975,
   'f_test': 0.63649251124853623,
   'f_train': 0.93789768388903016,
   'pred_time': 0.005651950836181641,
   'train_time': 0.008510589599609375},
  1: {'acc_test': 0.60192395785616126,
   'acc_train': 0.87905627242880

In [96]:
# TODO: Import 'GridSearchCV', 'make_scorer', and any other necessary libraries
from sklearn.grid_search import GridSearchCV
from sklearn.metrics import make_scorer

# TODO: Initialize the classifier
clf = LogisticRegression()

# TODO: Create the parameters list you wish to tune
parameters = {'solver': ['newton-cg', 'lbfgs', 'sag'],
              'C': [0.01, 0.1, 1.0, 10.0, 100.0, 1000.0],
              'random_state': [None, 101, 20160101]}

# TODO: Make an fbeta_score scoring object
scorer = make_scorer(fbeta_score, beta=beta)

# TODO: Perform grid search on the classifier using 'scorer' as the scoring method
grid_obj = GridSearchCV(clf, parameters, scoring=scorer)

# TODO: Fit the grid search object to the training data and find the optimal parameters
grid_fit = grid_obj.fit(X_train, y_train)

# Get the estimator
best_clf = grid_fit.best_estimator_

# Make predictions using the unoptimized and model
predictions = (clf.fit(X_train, y_train)).predict(X_test)
best_predictions = best_clf.predict(X_test)

# Report the before-and-afterscores
print ("Unoptimized model\n------")
print ("Accuracy score on testing data: {:.4f}".format(accuracy_score(y_test, predictions)))
print ("F-score on testing data: {:.4f}".format(fbeta_score(y_test, predictions, beta = 0.5)))
print ("\nOptimized Model\n------")
print ("Final accuracy score on the testing data: {:.4f}".format(accuracy_score(y_test, best_predictions)))
print ("Final F-score on the testing data: {:.4f}".format(fbeta_score(y_test, best_predictions, beta = 0.5)))

# show best parameters
print ("\nBest Classifier\n------")
print (best_clf)



Unoptimized model
------
Accuracy score on testing data: 0.6670
F-score on testing data: 0.7099

Optimized Model
------
Final accuracy score on the testing data: 0.6666
Final F-score on the testing data: 0.7096

Best Classifier
------
LogisticRegression(C=100.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='lbfgs', tol=0.0001,
          verbose=0, warm_start=False)


In [115]:
from sklearn.externals import joblib

best_clf= clf_A
filename = 'logistic_regression_model_invesco.joblib.pkl'

_ = joblib.dump(best_clf, filename, compress=9)

In [116]:
print(best_clf)

clf_loaded = joblib.load(filename)

print(clf_loaded)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=101, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)
LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=101, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)


In [139]:
pred = clf_loaded.predict(test)
pred_prob = clf_loaded.predict_proba(test)



In [149]:
pred_prob[:,1]

array([ 0.78331917,  0.69380115,  0.68225371, ...,  0.88846935,
        0.78268406,  0.75147528])

In [150]:
pred_prob = pd.DataFrame(pred_prob[:,1],columns=["Propensity_Score"])
pred_prob.head()

Unnamed: 0,Propensity_Score
0,0.783319
1,0.693801
2,0.682254
3,0.664828
4,0.678845


In [138]:
pred_df= pd.DataFrame(pred,columns=["Redeem_Status"])

pred_df=pred_df.replace([0,1],['NO','YES'])
pred_df.head()

Unnamed: 0,Redeem_Status
0,YES
1,YES
2,YES
3,YES
4,YES


In [151]:
result = pd.concat([test_df, pred_prob, pred_df], axis=1)

result.head()


Unnamed: 0,Unique_Advisor_Id,Unique_Investment_Id,Propensity_Score,Redeem_Status
0,1000103,14147,0.783319,YES
1,1000103,3534,0.693801,YES
2,1000103,3651,0.682254,YES
3,1000103,7668,0.664828,YES
4,1000103,9339,0.678845,YES


In [153]:
result.to_csv('test_data.csv',index=False)