In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
% matplotlib inline
from math import ceil

## Python Basics

In [None]:
# variable assignment and printing
a, b = 5, 3   # bind two names in one statement
c = a + b     # the sum gets a name of its own
print(c)

In [None]:
# lists and for loops
list_of_colors = [
    "Violet",
    "Blue",
    "Green",
    "Yellow",
    "Orange",
    "Red",
]
for color in list_of_colors:
    # indented lines belong to the loop body and run once per item
    message = f"This is an item in the list: {color}"
    print(message)
# back at the left margin: the loop has finished
print("There are no more items in the list")
    

In [None]:
# dictionaries: key-value pairs (here: student name -> age)
student_ages = {
    "Jack": 15,
    "Sarah": 17,
    "Greg": 12,
    "Jamie": 14,
    "Lucy": 12,
}
# look a value up by its key
jamies_age = student_ages["Jamie"]
print(jamies_age)

## Load Data

In [None]:
# Read the year-0 merged data; the first CSV column is used as the row index.
df0 = pd.read_csv(
    "../data/merged_data_year_0.csv",
    index_col=0,
)

## Inspect and Clean Data

In [None]:
# Show the first five rows as a quick look at the raw table
df0.head(n=5)

In [None]:
# Report the table's dimensions: label on one line, the shape tuple on the next
print("(n rows, n features):", df0.shape, sep="\n")

In [None]:
# One thousand features so let's look at a few
def show_column_summary(column):
    """Print a column's name and first rows, then bar-plot its value counts.

    Parameters
    ----------
    column : pandas.Series
        The column to summarise.
    """
    print(column.name)
    print(column.head())
    # Title and axis labels let each figure be read on its own.
    ax = column.value_counts().plot(kind="bar", title=str(column.name))
    ax.set(xlabel="value", ylabel="number of rows")
    plt.show()


# Column positions to spot-check; the same summary is produced for each one.
for position in (100, 200, 300):
    var = df0.iloc[:, position]
    show_column_summary(var)


What do we notice?
    
    
Some values stick out: 96, 98.  What should we do about them?
    
Data type is "Object" which usually means it's represented as a string.  
Should we convert them to numbers?
    

In [None]:
# Inspection and the documentation show values that are not real answers (blanks, 99, 9999).
## Recode all of them to the single code '98'.
df0 = (
    df0
    .replace(r'^\s*$', '98', regex=True)    # empty or whitespace-only cells
    .replace('^9[0-9]+', '98', regex=True)  # a leading 9 followed by one or more digits (96, 99, 9999, ...)
)

#### Inspect the target variable
There are three potential targets but we will use YPSUP07

PSUP07  B  Ran away from home in last year                               

          Value    Label

              1    Never
              2    A few times
              3    Lots of times
             96 M  More than 1 tick
             98 M  Missing


In [None]:
# Inspect the target variable
target_counts = df0['YPSUP07'].value_counts()
print(target_counts)
# The plot kind is passed by keyword: Series.plot() rejects positional arguments.
target_counts.plot(kind="bar")

In [None]:
# remove participants without a valid answer to the target variable
# .copy() makes df0_new an independent frame, so recoding it later does not
# write into (or warn about) a slice of df0.
df0_new = df0[df0['YPSUP07'] != '98'].copy()

In [None]:
# Convert to a binary question to simplify the problem: "Did run away AT ALL in the last year?"
#   '1' (Never)                              -> '0'
#   '2' (A few times), '3' (Lots of times)   -> '1'
# A single mapping is applied and the result assigned back, rather than calling
# replace(inplace=True) on a column selection, which may leave df0_new unchanged.
# NOTE: run this cell once per fresh df0_new; running it again would recode the new '1' to '0'.
df0_new = df0_new.assign(
    YPSUP07=df0_new['YPSUP07'].replace({'1': '0', '2': '1', '3': '1'})
)

In [None]:
# Class counts of the binarised target, reused for the printout and the plot
vc = df0_new['YPSUP07'].value_counts()
print("Binarised Target Variable:")
print("0 == Never ran away, 1 == Ran away at least once: ")
print(vc)
# Scale to a percentage before rounding so the printed figure has no float noise
# (e.g. 7.000000000000001); .get() yields 0 when nobody is in class "1".
percent_ran_away = round(100 * vc.get("1", 0) / len(df0_new), 1)
print("Percent that ran away: ", percent_ran_away, "%")
vc.plot(kind="bar")

In [None]:
# Another candidate target: bar chart of the YPSUP06 answers
df0_new['YPSUP06'].value_counts().plot.bar()
plt.show()

#YPSUP05 isn't in year 1, so its plot is left disabled:
#df0_new['YPSUP05'].value_counts().plot(kind="bar")
#plt.show()

### Prepare data for machine learning

In [None]:
from sklearn.model_selection import train_test_split
# to build the models
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

#metrics:
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score




In [None]:
# The binarised YPSUP07 column is the label to predict
target = df0_new['YPSUP07']
# Remove all three candidate target columns and DMMYID (presumably a participant
# identifier — confirm) so they cannot be used as predictors. `columns=` replaces
# the positional axis argument; errors="ignore" skips names that are absent.
features = df0_new.drop(columns=['YPSUP05', 'YPSUP06', 'YPSUP07', "DMMYID"], errors="ignore")

In [None]:
# Hold out 20% of the rows for testing; stratifying on the target keeps the
# class proportions the same in both parts, and the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    features,
    target,
    test_size=.2,
    random_state=0,
    stratify=target,
)

In [None]:
def print_model_full_report(y_test, y_pred):
    """Print accuracy, a labelled 2x2 confusion matrix and the classification report.

    Uses accuracy_score, confusion_matrix and classification_report from
    sklearn.metrics, all imported in the notebook's scikit-learn import cell.

    Parameters
    ----------
    y_test : array-like
        True labels.
    y_pred : array-like
        Predicted labels, aligned with ``y_test``.

    Notes
    -----
    The table labels and ``target_names`` are fixed to two classes ('0' and '1'),
    so this report only fits a binary problem.
    """
    print('Accuracy: ', "%.3f" % (accuracy_score(y_test, y_pred)),'\n')
    # Rows are the true classes, columns the predicted classes
    print(pd.DataFrame(confusion_matrix(y_test, y_pred),
                       columns=['PREDICTED 0', 'PREDICTED 1'],
                       index=['ACTUAL 0', 'ACTUAL 1']),'\n')
    print(classification_report(y_test, y_pred, target_names=['0','1']))
    
def print_model_report(y_test, y_pred, pos_label='1'):
    """Print accuracy, recall, precision and F1 on a single line.

    Uses accuracy_score, recall_score, precision_score and f1_score from
    sklearn.metrics, all imported in the notebook's scikit-learn import cell.

    Parameters
    ----------
    y_test : array-like
        True labels.
    y_pred : array-like
        Predicted labels, aligned with ``y_test``.
    pos_label : label, default '1'
        The class treated as "positive" for recall, precision and F1. The
        default matches the string coding of the binarised target.
    """
    print('\t Accuracy: ', "%.2f" % (accuracy_score(y_test, y_pred)),
          '\t Recall:   ', "%.2f" % (recall_score(y_test, y_pred, pos_label=pos_label)),
          '\t Precision:', "%.2f" % (precision_score(y_test, y_pred, pos_label=pos_label)),
          '\t F1:', "%.2f" % (f1_score(y_test, y_pred, pos_label=pos_label)),'\n')

In [None]:
# Baseline classifier: logistic regression with a fixed seed for repeatable results
logreg = LogisticRegression(random_state=0)


In [None]:
# Train the baseline model on the training split
logreg.fit(X_train, y_train)

In [None]:
# Predict a label for every row of the held-out test split
y_pred = logreg.predict(X_test)

In [None]:
# Score the model on the test split (for a scikit-learn classifier, score() reports accuracy)
logreg.score(X_test,y_test)

Job done?

Not quite....

In [None]:
# Confusion matrix as a labelled table: rows are the true classes, columns the predictions.
# The second row is the positive class, so it is labelled (1).
pd.DataFrame(confusion_matrix(y_test, y_pred),
                       columns=['Predicted No Risk (0)', 'Predicted Risk (1)'],
                       index=['Actual No Risk (0)', 'Actual Risk (1)'])

Return to slides for a discussion on model evaluation and data imbalance

In [None]:
# Per-class precision, recall and F1 for the baseline model's test predictions
print(classification_report(y_test, y_pred))

### Undersampling

In [None]:
# split the training data up by target response
train_data = pd.concat([X_train, y_train], axis=1) 
train_stayhome = train_data[train_data.YPSUP07 == '0']
train_runaways = train_data[train_data.YPSUP07 == '1']

In [None]:
#calculate the number of stay at homes we want to keep
# Despite its name, this is the number of stay-at-home rows kept per runaway row.
runaway_2_stayhome_ratio = 2
num_stayhome = len(train_stayhome)
num_runaways = len(train_runaways)
# Cap the request at the rows that actually exist: sampling more rows than
# available (without replacement) would raise an error.
num_undersample = min(num_stayhome, int(ceil(runaway_2_stayhome_ratio * num_runaways)))
print("number of runaways in training data", num_runaways)
print("number of stay-at-homes kept after undersampling", num_undersample)


In [None]:
# Draw num_undersample stay-at-home rows at random (fixed seed), then stack
# them underneath all of the runaway rows
train_stayhome_undersample = train_stayhome.sample(
    n=num_undersample,
    random_state=0,
)
train_under = pd.concat(
    [train_runaways, train_stayhome_undersample],
    axis=0,
)

In [None]:
# re-split features and targets
# `columns=` replaces the positional axis argument, which newer pandas does not accept
X_train_under = train_under.drop(columns=['YPSUP07'])
y_train_under = train_under['YPSUP07']

In [None]:
# Class balance of the undersampled training labels
y_train_under.value_counts().plot.bar()

In [None]:
# Class balance of the untouched test labels; the plot kind is passed by
# keyword because Series.plot() rejects positional arguments
y_test.value_counts().plot(kind="bar")

### Re-train Model with undersampled data

In [None]:
# Fresh logistic regression, trained this time on the undersampled data
# (fit() returns the estimator itself, so construction and fitting can be chained)
logreg = LogisticRegression(random_state=0).fit(X_train_under, y_train_under)
# Evaluation still uses the original test split
y_pred = logreg.predict(X_test)

In [None]:
# Per-class precision, recall and F1 for the model trained on undersampled data
print(classification_report(y_test, y_pred))

Return to slides for discussion on Random Forests

## Random Forest

In [None]:
# Random forest classifier with a fixed seed for repeatable results
rf_mdl= RandomForestClassifier(random_state=0)



In [None]:
# Train the forest on the undersampled training data
rf_mdl.fit(X_train_under, y_train_under)

In [None]:
# Predict on the original test split (this overwrites the earlier y_pred)
y_pred = rf_mdl.predict(X_test)

In [None]:
# Per-class precision, recall and F1 for the random forest's test predictions
print(classification_report(y_test, y_pred))

### Feature Importance

In [None]:
# Raw importance scores from the fitted forest, one per training column
rf_mdl.feature_importances_

In [None]:
# Label each importance score with its column name and order from most to least important
fi = pd.Series(
    rf_mdl.feature_importances_,
    index=X_train_under.columns,
).sort_values(ascending=False)


In [None]:
# The five highest-ranked features
fi.head(5)

In [None]:
# Total of all importance scores (expected to be about 1 for a scikit-learn forest — TODO confirm)
fi.sum()

In [None]:
# Bar chart of the ten highest-ranked features
fi.iloc[:10].plot.bar(figsize=(12, 6))

In [None]:
# Bar chart of the twenty highest-ranked features
fi.iloc[:20].plot.bar(figsize=(12, 6))

## Exercises 

In [None]:
# Exercise scaffold: try several values of a model parameter and compare
# the precision and recall each one gives.
params = [0] # Fill this in with the values you want to test
precision = {}
recall = {}
for p in params:
    # train and test a model and add the precision and recall to the respective dictionaries
    precision[p] = 0 #fill this in (hint: use the function precision_score() and shift-tab to see what parameters it takes)
    recall[p] = 0 #fill this in  (hint: use the function recall_score() and shift-tab to see what parameters it takes)
    
    
# One line per metric, with the tested parameter values along the x-axis
pd.DataFrame({"precision":precision, "recall":recall}).plot()   