In [188]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [189]:
data = pd.read_csv(r"D:\datasets\investors.csv")
data = data.rename(columns={'Which best describes your gender?':'gender'})
data = data.drop(columns=['Username', 'Timestamp'])
data.head(2)

Unnamed: 0,gender,Age,occupation,education,Investment,proportion,rank,rank.1,rank.2,rank.3,...,monitor_investment,expect_return,Preferred_avenue,Savings_objective,Reason_equity,Reason_MF,purpose_investment,Reason_gb,Reason_fd,source
0,Male,39,Salaried,Post Graduate,Yes,10% - 20%,3,3,3,3,...,Monthly,10% - 20%,Fixed Deposits,Retirement Plan,Power of compounding,Better Returns,Savings for retirement,Safe Investment,Fixed returns,Newspapers and Magazines
1,Male,30,Salaried,Post Graduate,Yes,20% - 30%,3,6,2,2,...,Weekly,10% - 20%,Equity,Retirement Plan,Capital Appreciation,Better Returns,Wealth creation,Risk-free returns,Fixed returns,Internet


In [190]:
 data.describe()

Unnamed: 0,Age,rank,rank.1,rank.2,rank.3,rank.4,rank.5,rank.6
count,132.0,132.0,132.0,132.0,132.0,132.0,132.0,132.0
mean,32.992424,4.795455,4.537879,3.416667,4.090909,4.113636,4.772727,4.492424
std,10.924792,1.860553,1.775169,1.548413,1.758098,1.839547,1.859947,1.696409
min,18.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
25%,25.75,4.0,3.0,2.0,3.0,3.0,4.0,3.0
50%,30.0,5.0,5.0,4.0,4.0,4.0,5.0,5.0
75%,37.25,7.0,6.0,4.0,5.0,6.0,6.25,6.0
max,70.0,7.0,7.0,7.0,7.0,7.0,7.0,7.0


In [191]:
data.isna().sum()

gender                  0
Age                     0
occupation              0
education               0
Investment              0
proportion              0
rank                    0
rank.1                  0
rank.2                  0
rank.3                  0
rank.4                  0
rank.5                  0
rank.6                  0
stock_invest            0
factors                 0
Investment_objective    0
investment_time         0
monitor_investment      0
expect_return           0
Preferred_avenue        0
Savings_objective       0
Reason_equity           0
Reason_MF               0
purpose_investment      0
Reason_gb               0
Reason_fd               0
source                  0
dtype: int64

In [192]:
x = data[['Age', 'gender', 'education', 'occupation','rank.1','rank.2','rank.3','rank.4','rank.5','rank.6', 'proportion', ]]
y = data['Investment']

In [193]:
from sklearn.compose import make_column_transformer
from sklearn.preprocessing import OneHotEncoder
column = make_column_transformer(
    (OneHotEncoder(), ['occupation', 'education', 'gender', 'proportion']),
    remainder='passthrough'
)

In [194]:
x = column.fit_transform(x)

In [195]:
x = np.asanyarray(x)
y = np.asanyarray(y)

In [196]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.20)

In [197]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
x_train = scaler.fit_transform(x_train)
x_test = scaler.transform(x_test)

In [198]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier



In [199]:
model_params = { 
    'DecisionTreeClassifier': {
       'model': DecisionTreeClassifier(),
       'params': {'max_depth': [6, 8, 10],
                  'min_samples_split':[2, 4, 8],
                  'min_samples_leaf': [2, 4, 8]}
},
'RandomForestClassifier': {
    'model': RandomForestClassifier(),
    'params': {'n_estimators':[100, 125, 150],
               'max_depth':[8, 10, 12]}
},
'MLPClassifier':{
    'model': MLPClassifier(),
    'params': {'hidden_layer_sizes':[16, 64, 80, 100],
               'max_iter':[50, 100, 125]}
},
'SVC':{
    'model': SVC(),
    'params':{'kernel':['sigmoid', 'linear', 'rbf'],
              'C': [1, 10, 20]}
}
}

In [200]:
from sklearn.model_selection import GridSearchCV
scores = []
for model_name, mp in model_params.items():
    test = GridSearchCV(mp['model'], mp['params'], cv=5, return_train_score=False)
    test.fit(x, y)
    scores.append({
        'model': model_name,
        'best_score': test.best_score_,
        'best_params': test.best_params_
    })



In [201]:
dataframe = pd.DataFrame(scores, columns=['model', 'best_score', 'best_params'])
dataframe

Unnamed: 0,model,best_score,best_params
0,DecisionTreeClassifier,0.72735,"{'max_depth': 6, 'min_samples_leaf': 8, 'min_s..."
1,RandomForestClassifier,0.787749,"{'max_depth': 12, 'n_estimators': 150}"
2,MLPClassifier,0.803419,"{'hidden_layer_sizes': 64, 'max_iter': 125}"
3,SVC,0.780342,"{'C': 1, 'kernel': 'rbf'}"


In [202]:
model = MLPClassifier(hidden_layer_sizes=64, max_iter=125)
model.fit(x_train, y_train)



In [203]:

model.score(x_train, y_train)

0.8952380952380953

In [204]:
model.score(x_test, y_test)

0.8518518518518519

In [211]:
x_train = pd.DataFrame(x_train)
x_new = x_train.sample(2, random_state=99)

In [212]:
y_pred_new = model.predict(x_new)
y_pred_new

array(['Yes', 'Yes'], dtype='<U3')