# We will be using Random Forest, Logistic Regression, KNN and SGD classifiers.

In [None]:
import pandas as pd
import numpy as np

from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier


from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix

In [None]:
df = pd.read_csv('data_for_training.csv', index_col = 0)

In [None]:
df.drop(df[df['LOCALE'].isin(['–','†' ]) == True].index, inplace = True)

In [None]:
df.reset_index( inplace = True, drop = True)

Separate the independent features from the labels before training.

In [None]:
Y_HI = df['HI-LABEL']
Y_STRICT = df['STRICT-LABEL']

In [None]:
df.drop(columns =['HI-LABEL', 'STRICT-LABEL', 'FREE LUNCH ELIGIBLE'], inplace = True)

In [None]:
title_i = df.drop(columns = 'TITLE I ELIGIBLE', inplace = True)

In [None]:
df.head()

In [None]:
df.drop(columns = ['AMERICAN INDIAN'], inplace = True)

In [None]:
# Integer code for each state abbreviation. Keys carry a trailing space, so
# lookups must use exactly the same spelling.
#
# Codes 34-36 are intentionally unused: the original literal listed 'DE ' and
# 'NY ' twice (34/37 and 35/38). Python keeps the last value for a repeated
# key, so the effective codes were already 37 and 38. The duplicates are
# removed here and the effective values kept, so encoded data is unchanged.
state_code = {
    'CA ': 1, 'WY ': 2, 'MO ': 3, 'TX ': 4, 'OK ': 5, 'MS ': 6, 'KY ': 7,
    'ID ': 8, 'OR ': 9, 'WI ': 10, 'ME ': 11, 'MA ': 12, 'NC ': 13,
    'VA ': 14, 'OH ': 15, 'IN ': 16, 'AL ': 17, 'IA ': 18, 'FL ': 19,
    'LA ': 20, 'CT ': 21, 'SC ': 22, 'GA ': 23, 'MD ': 24,
    'NE ': 25, 'SD ': 26, 'MT ': 27, 'RI ': 28, 'UT ': 29, 'WV ': 30,
    'AR ': 31, 'NV ': 32, 'HI ': 33,
    'DE ': 37, 'NY ': 38, 'MN ': 39, 'NM ': 40, 'NJ ': 41, 'PA ': 42,
    'ND ': 43, 'NH ': 44, 'MI ': 45, 'KS ': 46, 'CO ': 47,
    'VT ': 48, 'DC ': 49, 'WA ': 50, 'IL ': 51, 'AK ': 52, 'AZ ': 53,
    'TN ': 54,
}

In [None]:
# Integer code (101-120) for every known LOCALE label, one entry per line.
# Labels come in two spellings: a single-digit prefix ('1-Large city') and a
# two-digit prefix ('11-City: Large'); each distinct label has its own code.
locale_code = {
    '3-Urban fringe of large city': 101,
    '6-Small town': 102,
    '5-Large town': 103,
    '2-Midsize city': 104,
    '8-Rural - inside CBSA/MSA': 105,
    '7-Rural - outside CBSA/MSA': 106,
    '4-Urban fringe of midsize city': 107,
    '1-Large city': 108,
    '41-Rural: Fringe': 109,
    '22-Suburb: Mid-size': 110,
    '21-Suburb: Large': 111,
    '42-Rural: Distant': 112,
    '11-City: Large': 113,
    '13-City: Small': 114,
    '43-Rural: Remote': 115,
    '33-Town: Remote': 116,
    '12-City: Mid-size': 117,
    '32-Town: Distant': 118,
    '31-Town: Fringe': 119,
    '23-Suburb: Small': 120,
}

In [None]:
df['ABBR'] = df['ABBR'].apply(lambda x : state_code[x] if x in state_code else 0)
df['ABBR'] = df['ABBR'].astype('category')
df['ABBR'] = df['ABBR'].astype('str')

In [None]:
df['LOCALE'] = df['LOCALE'].apply(lambda x : locale_code[x] if x in locale_code else 121)
df['LOCALE'] = df['LOCALE'].astype('category')
df['LOCALE'] = df['LOCALE'].astype('str')

In [None]:
# Columns to standardise with StandardScaler.
num_cat = ['TOTAL STUDENTS', 'ASIAN', 'HISPANIC', 'BLACK', 'WHITE']
scl = StandardScaler()

# The scaler is refit on every pass, so once the loop finishes `scl` holds the
# statistics of the last column only.
# NOTE(review): the statistics are computed on all rows, before the
# train/test split further down — consider fitting on training rows only.
for col in num_cat:
    single_column = df[[col]].values  # 2-D array with one column
    df[col] = scl.fit_transform(single_column)


In [None]:
# Preview the feature table after encoding and scaling.
df.head()

In [None]:
# Write the processed feature table to disk.
df.to_csv('final_model_for_tuning.csv')

In [None]:
# Write the HI-LABEL target to disk.
Y_HI.to_csv('high-poverty-label.csv')

In [None]:
# Write the STRICT-LABEL target to disk.
Y_STRICT.to_csv('strict-poverty-label.csv')

In [None]:
# Stratified train/test split for the HI-LABEL task. No test_size is given,
# so scikit-learn's default proportions apply; random_state fixes the shuffle.
X_train, X_test, y_train, y_test = train_test_split(
    df,
    Y_HI,
    stratify=Y_HI,
    random_state=10,
)

# Train HI-LABEL using LogisticRegression

In [None]:
log_reg = LogisticRegression(solver='lbfgs', max_iter=1000)

In [None]:
log_reg.fit(X_train, y_train)

In [None]:
y_reg = log_reg.predict(X_test)

In [None]:
accuracy_score( y_reg, y_test)

In [None]:
accuracy_score( log_reg.predict(X_train), y_train)

In [None]:
confusion_matrix( y_reg, y_test)

# Train STRICT-LABEL using LogisticRegression

In [None]:
X_S_train, X_S_test, y_s_train, y_s_test = train_test_split( df, Y_STRICT, stratify = Y_STRICT, random_state = 10)

In [None]:
lr = LogisticRegression(solver='lbfgs',max_iter=1000)

In [None]:
lr.fit( X_S_train, y_s_train)

In [None]:
y_s_pred = lr.predict(X_S_test)

In [None]:
accuracy_score( y_s_pred, y_s_test)

In [None]:
accuracy_score( lr.predict(X_S_train), y_s_train)

In [None]:
confusion_matrix( y_s_pred, y_s_test)

# Train HI-LABEL using KNeighborsClassifier

In [None]:
k_clf = KNeighborsClassifier(n_neighbors = 50)

In [None]:
k_clf.fit(X_train, y_train)

In [None]:
y_clf_pred = k_clf.predict( X_test)

In [None]:
accuracy_score( y_clf_pred, y_test)

In [None]:
accuracy_score( k_clf.predict(X_train), y_train)

In [None]:
confusion_matrix( y_clf_pred, y_test)

# Train STRICT-LABEL using KNeighborsClassifier

In [None]:
k_clf_s = KNeighborsClassifier( n_neighbors = 25)

In [None]:
k_clf_s.fit( X_S_train, y_s_train)

In [None]:
y_s_clf_pred = k_clf_s.predict( X_S_test)

In [None]:
accuracy_score(y_s_clf_pred,  y_s_test)

In [None]:
accuracy_score( k_clf_s.predict(X_S_train), y_s_train)

In [None]:
confusion_matrix( y_s_clf_pred, y_s_test)

# Train HI-LABEL using Logistic Regression with SGD Training

In [None]:
sgd_model = SGDClassifier( loss = 'log', penalty = 'l2', max_iter = 1000)

In [None]:
sgd_model.fit( X_train, y_train)

In [None]:
y_sgd_pred = sgd_model.predict(X_test)

In [None]:
accuracy_score( y_sgd_pred, y_test)

In [None]:
accuracy_score( sgd_model.predict(X_train), y_train)

In [None]:
confusion_matrix( y_sgd_pred, y_test)

# Train STRICT-LABEL using LogReg with SGD Training

In [None]:
sgd_model = SGDClassifier( loss = 'log', penalty = 'l2', max_iter = 1000)

In [None]:
sgd_model.fit( X_S_train, y_s_train)

In [None]:
y_sgd_pred = sgd_model.predict(X_S_test)

In [None]:
accuracy_score( y_sgd_pred, y_s_test)

In [None]:
accuracy_score( sgd_model.predict(X_S_train), y_s_train)

In [None]:
confusion_matrix( y_sgd_pred, y_s_test)

# Random Forest Classifier for HI-LABEL

In [None]:
rf_model = RandomForestClassifier( n_estimators = 1000, min_samples_leaf = 50, random_state = 1 )

In [None]:
rf_model.fit(X_train, y_train)

In [None]:
y_forest_pred = rf_model.predict(X_test)

In [None]:
accuracy_score ( y_forest_pred, y_test)

In [None]:
accuracy_score( rf_model.predict(X_train), y_train)

In [None]:
confusion_matrix( y_forest_pred, y_test)

# Random Forest Classifier for STRICT-LABEL

In [None]:
rfs_model = RandomForestClassifier( n_estimators = 1000, min_samples_leaf = 50, random_state = 1 )

In [None]:
rfs_model.fit(X_S_train, y_s_train)

In [None]:
ys_forest_pred = rfs_model.predict(X_S_test)

In [None]:
accuracy_score ( ys_forest_pred, y_s_test)

In [None]:
accuracy_score( rfs_model.predict(X_S_train), y_s_train)

In [None]:
confusion_matrix( ys_forest_pred, y_s_test)

## We will focus on RandomForestClassifier and KNeighborsClassifier when fine-tuning our model.