In [17]:
import sklearn 
from sklearn.datasets import fetch_openml
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
import pytest
import numpy as np


In [3]:
market_df = fetch_openml(name="bank-marketing")
market_df = pd.DataFrame(market_df['data'])
market_df = market_df[['V1', 'V2', 'V3', 'V4', 'V9', 'V12', 'V16']]
market_df = market_df.rename(columns={'V1': 'age', 'V2': 'job', 'V3': 'marital', 'V4': 'education', 'V9': 'contact', 'V12': 'duration', 'V16':'outcome'})
market_df

  warn(


Unnamed: 0,age,job,marital,education,contact,duration,outcome
0,58.0,management,married,tertiary,unknown,261.0,unknown
1,44.0,technician,single,secondary,unknown,151.0,unknown
2,33.0,entrepreneur,married,secondary,unknown,76.0,unknown
3,47.0,blue-collar,married,unknown,unknown,92.0,unknown
4,33.0,unknown,single,unknown,unknown,198.0,unknown
...,...,...,...,...,...,...,...
45206,51.0,technician,married,tertiary,cellular,977.0,unknown
45207,71.0,retired,divorced,primary,cellular,456.0,unknown
45208,72.0,retired,married,secondary,cellular,1127.0,success
45209,57.0,blue-collar,married,secondary,telephone,508.0,unknown


In [68]:
market_df['outcome'].value_counts()

failure    4901
success    1511
other         0
unknown       0
Name: outcome, dtype: int64

In [4]:
market_df = market_df[(market_df.outcome == 'failure')|(market_df.outcome == 'success')]

In [5]:
market_df[market_df.columns.difference(['outcome'])]

Unnamed: 0,age,contact,duration,education,job,marital
0,58.0,unknown,261.0,tertiary,management,married
1,44.0,unknown,151.0,secondary,technician,single
2,33.0,unknown,76.0,secondary,entrepreneur,married
3,47.0,unknown,92.0,unknown,blue-collar,married
4,33.0,unknown,198.0,unknown,unknown,single
...,...,...,...,...,...,...
45206,51.0,cellular,977.0,tertiary,technician,married
45207,71.0,cellular,456.0,primary,retired,divorced
45208,72.0,cellular,1127.0,secondary,retired,married
45209,57.0,telephone,508.0,secondary,blue-collar,married


In [88]:
pd.DataFrame.from_dict({'age': [25], 'contact': ['cellular'], 'duration': [30], 'education': ['secondary'], 'job': ['student'], 'marital': ['single']})

Unnamed: 0,age,contact,duration,education,job,marital
0,75,cellular,900,secondary,retired,divorced


In [65]:
market_df['age'].describe()

count    6412.000000
mean       41.299595
std        11.496071
min        18.000000
25%        33.000000
50%        38.000000
75%        48.000000
max        93.000000
Name: age, dtype: float64

In [8]:
market_df['education'].value_counts()

secondary    3264
tertiary     2114
primary       776
unknown       258
Name: education, dtype: int64

In [89]:
feature_train, feature_test, label_train, label_test = train_test_split(market_df[market_df.columns.difference(['outcome'])], market_df['outcome'], 
                                                                                  test_size=0.2, shuffle=True, stratify=market_df['outcome'], random_state=607)
feature_test_ref = feature_test.copy()                                                                   

In [90]:
feature_train, feature_test, label_train, label_test = train_test_split(market_df[market_df.columns.difference(['outcome'])], market_df['outcome'], 
                                                                                  test_size=0.2, shuffle=True, stratify=market_df['outcome'], random_state=607)
ct = ColumnTransformer([('scale', StandardScaler(), ['age', 'duration']), ('onehot', OneHotEncoder(), ['job', 'marital', 'contact'])])
feature_train = ct.fit_transform(feature_train)
feature_test = ct.transform(feature_test)

In [97]:
ct.get_params

<bound method ColumnTransformer.get_params of ColumnTransformer(transformers=[('scale', StandardScaler(),
                                 ['age', 'duration']),
                                ('onehot', OneHotEncoder(),
                                 ['job', 'marital', 'contact'])])>

In [103]:
float(feature_train[0, 0])

-1.253294838763468

In [6]:
gbc = RandomForestClassifier(class_weight='balanced')
gbc.fit(feature_train, label_train)
predictions = gbc.predict(feature_test)

print("\nClassification Results: \n", classification_report(label_test, predictions))



Classification Results: 
               precision    recall  f1-score   support

     failure       0.79      0.90      0.84       981
     success       0.42      0.24      0.30       302

    accuracy                           0.74      1283
   macro avg       0.61      0.57      0.57      1283
weighted avg       0.70      0.74      0.72      1283



In [31]:
list(predictions[-20:])

['failure',
 'failure',
 'failure',
 'failure',
 'failure',
 'success',
 'success',
 'failure',
 'failure',
 'success',
 'success',
 'failure',
 'failure',
 'failure',
 'success',
 'success',
 'failure',
 'failure',
 'failure',
 'failure']

In [41]:
label_test

0       failure
1       failure
2       success
3       success
4       success
         ...   
1278    failure
1279    failure
1280    failure
1281    failure
1282    failure
Name: outcome, Length: 1283, dtype: category
Categories (4, object): ['failure', 'other', 'success', 'unknown']

In [36]:
label_test.reset_index(drop=True, inplace=True)
feature_test.reset_index(drop=True, inplace=True)
feature_test_ref.reset_index(drop=True, inplace=True)

In [67]:
feature_test_ref

Unnamed: 0,age,contact,duration,education,job,marital
0,40.0,cellular,74.0,secondary,management,married
1,31.0,cellular,148.0,primary,blue-collar,married
2,76.0,cellular,344.0,tertiary,retired,married
3,28.0,cellular,940.0,tertiary,management,single
4,61.0,cellular,232.0,secondary,retired,married
...,...,...,...,...,...,...
1278,32.0,cellular,182.0,tertiary,management,single
1279,31.0,cellular,424.0,secondary,management,married
1280,49.0,cellular,260.0,tertiary,self-employed,divorced
1281,59.0,cellular,818.0,tertiary,management,married


In [72]:
feature_test_ref['marital']

0        married
1        married
2        married
3         single
4        married
          ...   
1278      single
1279     married
1280    divorced
1281     married
1282      single
Name: marital, Length: 1283, dtype: category
Categories (3, object): ['divorced', 'married', 'single']

In [75]:
for elems in set(feature_test_ref['marital']): 
    assert elems in ('married', 'single', 'divorced')

In [76]:
set(feature_test_ref['education'])

{'primary', 'secondary', 'tertiary', 'unknown'}

In [46]:
label_test[label_test == 'success']

2       success
3       success
4       success
9       success
10      success
         ...   
1263    success
1267    success
1271    success
1273    success
1277    success
Name: outcome, Length: 302, dtype: category
Categories (4, object): ['failure', 'other', 'success', 'unknown']

In [53]:
predictions = pd.Series(predictions)
predictions[predictions == 'success']

2       success
3       success
4       success
5       success
6       success
         ...   
1269    success
1272    success
1273    success
1277    success
1278    success
Length: 172, dtype: object

In [55]:
predictions[:10]

0    failure
1    failure
2    success
3    success
4    success
5    success
6    success
7    failure
8    failure
9    failure
dtype: object

In [63]:
feature_test_ref.iloc[9]

age                31.0
contact        cellular
duration          158.0
education      tertiary
job          management
marital          single
Name: 9, dtype: object

---

SyntaxError: invalid syntax (<ipython-input-73-29e0c3615294>, line 1)

In [77]:
churn_df = fetch_openml(name='churn', version=1)
churn_df = pd.DataFrame(churn_df['data'])
#churn_df = churn_df[churn_df[['state', 'account_length', 'area_code', 'international_plan', 'voice_mail_plan', 'total_day_minutes', 'total_intl_minutes']]]
churn_df

Unnamed: 0,state,account_length,area_code,phone_number,international_plan,voice_mail_plan,number_vmail_messages,total_day_minutes,total_day_calls,total_day_charge,total_eve_minutes,total_eve_calls,total_eve_charge,total_night_minutes,total_night_calls,total_night_charge,total_intl_minutes,total_intl_calls,total_intl_charge,number_customer_service_calls
0,16.0,128.0,415,2845.0,0,1,25.0,265.1,110.0,45.07,197.4,99.0,16.78,244.7,91.0,11.01,10.0,3.0,2.70,1
1,35.0,107.0,415,2301.0,0,1,26.0,161.6,123.0,27.47,195.5,103.0,16.62,254.4,103.0,11.45,13.7,3.0,3.70,1
2,31.0,137.0,415,1616.0,0,0,0.0,243.4,114.0,41.38,121.2,110.0,10.30,162.6,104.0,7.32,12.2,5.0,3.29,0
3,35.0,84.0,408,2510.0,1,0,0.0,299.4,71.0,50.90,61.9,88.0,5.26,196.9,89.0,8.86,6.6,7.0,1.78,2
4,36.0,75.0,415,155.0,1,0,0.0,166.7,113.0,28.34,148.3,122.0,12.61,186.9,121.0,8.41,10.1,3.0,2.73,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4995,11.0,50.0,408,2000.0,0,1,40.0,235.7,127.0,40.07,223.0,126.0,18.96,297.5,116.0,13.39,9.9,5.0,2.67,2
4996,49.0,152.0,415,394.0,0,0,0.0,184.2,90.0,31.31,256.8,73.0,21.83,213.6,113.0,9.61,14.7,2.0,3.97,3
4997,7.0,61.0,415,313.0,0,0,0.0,140.6,89.0,23.90,172.8,128.0,14.69,212.4,97.0,9.56,13.6,4.0,3.67,1
4998,7.0,109.0,510,3471.0,0,0,0.0,188.8,67.0,32.10,171.7,92.0,14.59,224.4,89.0,10.10,8.5,6.0,2.30,0


In [97]:
data = arff.loadarff('churn.arff')
churn_df = pd.DataFrame(data[0])
cols = ['area_code', 'international_plan', 'voice_mail_plan', 'number_customer_service_calls', 'class']
churn_df[cols] = churn_df[cols].apply(pd.to_numeric, errors='coerce', axis=1)
churn_df = churn_df[['account_length', 'area_code', 'international_plan', 'voice_mail_plan', 'total_day_minutes', 'total_intl_minutes', 'number_customer_service_calls', 'class']]
churn_df

Unnamed: 0,account_length,area_code,international_plan,voice_mail_plan,total_day_minutes,total_intl_minutes,number_customer_service_calls,class
0,128.0,415,0,1,265.1,10.0,1,0
1,107.0,415,0,1,161.6,13.7,1,0
2,137.0,415,0,0,243.4,12.2,0,0
3,84.0,408,1,0,299.4,6.6,2,0
4,75.0,415,1,0,166.7,10.1,3,0
...,...,...,...,...,...,...,...,...
4995,50.0,408,0,1,235.7,9.9,2,0
4996,152.0,415,0,0,184.2,14.7,3,1
4997,61.0,415,0,0,140.6,13.6,1,0
4998,109.0,510,0,0,188.8,8.5,0,0


In [102]:
churn_df['class'].value_counts()

0    4293
1     707
Name: class, dtype: int64

In [108]:
feature_train, feature_test, label_train, label_test = train_test_split(churn_df[churn_df.columns.difference(['class'])], churn_df['class'], 
                                                                                  test_size=0.2, shuffle=True, stratify=churn_df['class'], random_state=607)
ct = ColumnTransformer([('scale', StandardScaler(), ['account_length', 'total_day_minutes', 'total_intl_minutes']), ('onehot', OneHotEncoder(), ['area_code'])])
feature_train = ct.fit_transform(feature_train)
feature_test = ct.transform(feature_test)

In [109]:
gbc = RandomForestClassifier()
gbc.fit(feature_train, label_train)
predictions = gbc.predict(feature_test)

print("\nClassification Results: \n", classification_report(label_test, predictions))


Classification Results: 
               precision    recall  f1-score   support

           0       0.88      0.97      0.92       859
           1       0.52      0.23      0.32       141

    accuracy                           0.86      1000
   macro avg       0.70      0.60      0.62      1000
weighted avg       0.83      0.86      0.84      1000

