In [12]:
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import xgboost as xgb
from matplotlib.colors import ListedColormap
from scipy import stats
from sklearn.datasets import make_moons, make_circles, make_classification
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

## Load data

In [13]:
# Folder holding the three competition CSVs; change this one line to run on another machine.
DATA_DIR = "/Users/romulo/Documents/Dataset/Earthquake Damage"
# Number of leading rows loaded from each file (None reads every row).
N_ROWS = 100

# nrows makes read_csv stop after the first N_ROWS rows, so the full files are
# not parsed just to keep a small prefix.
df_x_train = pd.read_csv(os.path.join(DATA_DIR, "train_values.csv"), index_col="building_id", nrows=N_ROWS)
df_y_train = pd.read_csv(os.path.join(DATA_DIR, "train_labels.csv"), index_col="building_id", nrows=N_ROWS)
df_x_test = pd.read_csv(os.path.join(DATA_DIR, "test_values.csv"), index_col="building_id", nrows=N_ROWS)

# Attach the labels to the features with an index-on-index left join (index is building_id).
data_train = df_x_train.merge(df_y_train, how='left', left_index=True, right_index=True)

print("len train:",len(data_train))
# This line reports the test frame, so it is labelled as such.
print("len test:",len(df_x_test))

len train: 100
len train: 100


## Prepare data

In [14]:
# Pull the target column out as a plain NumPy array.
y_train = data_train['damage_grade'].values

# Remove the target from the feature frame. The column is named through the
# `columns=` keyword because pandas 2.0 no longer accepts the axis as a second
# positional argument to drop().
# Note: this cell rebinds data_train, so it can only be run once per load.
data_train = data_train.drop(columns='damage_grade')

In [15]:
# Stack train and test features so get_dummies produces the same dummy columns
# for both, then split them apart again by position.
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
df_x_all = pd.concat([data_train, df_x_test])
print("len all:",len(df_x_all))

# One-hot encode every object-typed (categorical) column.
cat_var = df_x_all.select_dtypes(include=['object']).columns.tolist()
df_x_all = pd.get_dummies(df_x_all, prefix=cat_var, columns=cat_var)

# The first len(data_train) rows are the training rows, the remainder the test rows.
x_train = df_x_all.iloc[:len(data_train)]
x_test = df_x_all.iloc[len(data_train):]

len all: 200


In [16]:
# Drop the names of the intermediate frames; the cells below only use
# x_train, x_test and y_train.
del df_x_all
del df_x_train, df_y_train
del data_train, df_x_test

In [17]:
# Hold out 10% of the training rows as a dev set; the fixed random_state makes
# the split repeatable.
# Note: x_train / y_train are rebound here, so re-running this cell would split
# the already-reduced training set again.
x_train, x_dev, y_train, y_dev = train_test_split(
    x_train,
    y_train,
    test_size=0.1,
    random_state=42,
)

In [18]:
# Report the row counts of each split next to the matching label counts.
n_train_rows, n_train_labels = len(x_train), len(y_train)
n_dev_rows, n_dev_labels = len(x_dev), len(y_dev)
n_test_rows = len(x_test)

print("len x_train : %d  len y_train: %d " % (n_train_rows, n_train_labels))
print("len x_dev   : %d  len y_dev  : %d " % (n_dev_rows, n_dev_labels))
print("len x_test  : %d" % n_test_rows)

len x_train : 90  len y_train: 90 
len x_dev   : 10  len y_dev  : 10 
len x_test  : 100


## Testing Algorithms from sklearn

In [19]:
# Seed shared by every estimator that draws random numbers, so re-running the
# notebook reproduces the same scores (same value as the train/dev split seed).
RANDOM_STATE = 42

# Display names, in the same order as `classifiers` below.
names = ["Nearest Neighbors", "Linear SVM", "RBF SVM", "Gaussian Process",
         "Decision Tree", "Random Forest", "Neural Net", "AdaBoost",
         "Naive Bayes", "QDA"]

# One estimator per entry of `names`; the two lists are paired by position.
classifiers = [
    KNeighborsClassifier(10),
    SVC(kernel="linear", C=0.025),
    SVC(gamma=2, C=1),
    GaussianProcessClassifier(1.0 * RBF(1.0)),
    DecisionTreeClassifier(max_depth=8, random_state=RANDOM_STATE),
    RandomForestClassifier(max_depth=8, n_estimators=1500, max_features=1, random_state=RANDOM_STATE),
    MLPClassifier(alpha=1, max_iter=1000, random_state=RANDOM_STATE),
    AdaBoostClassifier(random_state=RANDOM_STATE),
    GaussianNB(),
    QuadraticDiscriminantAnalysis()]

# zip() would silently drop trailing entries if the lists drift apart, so fail loudly instead.
assert len(names) == len(classifiers), "names and classifiers must have the same length"

In [20]:
# Fit every classifier, score it on train and dev, and record per-grade
# precision / recall / f1 on the dev set. The result table is rewritten to CSV
# after each model so partial results survive an interrupted run.

# Grades reported in the table. Passing them to classification_report keeps the
# "1"/"2"/"3" keys present even when a grade is absent from the small dev split.
# Assumes damage_grade holds the integers 1, 2 and 3, as the result columns imply.
damage_grades = [1, 2, 3]
metric_names = ["precision", "recall", "f1-score"]
result_columns = ["algorithm", "acc_train", "acc_dev"] + [
    "%s_%d" % (metric, grade) for grade in damage_grades for metric in metric_names
]

# to_csv does not create missing directories.
os.makedirs("results", exist_ok=True)

# Rows are collected in a list and the frame is rebuilt from it;
# DataFrame.append was removed in pandas 2.0.
records = []
df_results_sklearn = pd.DataFrame(columns=result_columns)

for name, clf in zip(names, classifiers):
    print(name)
    clf.fit(x_train, y_train)
    score_train = clf.score(x_train, y_train)
    score_dev = clf.score(x_dev, y_dev)
    print("\tscore_train: ",score_train)
    print("\tscore_dev: ",score_dev)
    print("\n\n")

    pred_y_pred = clf.predict(x_dev)
    dict_report = classification_report(y_dev, pred_y_pred, labels=damage_grades, output_dict=True)

    record = {"algorithm": name, "acc_train": score_train, "acc_dev": score_dev}
    for grade in damage_grades:
        for metric in metric_names:
            # Report keys are the string form of each label.
            record["%s_%d" % (metric, grade)] = dict_report[str(grade)][metric]
    records.append(record)

    df_results_sklearn = pd.DataFrame(records, columns=result_columns)
    df_results_sklearn.to_csv("results/df_results_sklearn_alg.csv")
        

Nearest Neighbors
	score_train:  0.6111111111111112
	score_dev:  0.5



Linear SVM


  'precision', 'predicted', average, warn_for)


	score_train:  0.6888888888888889
	score_dev:  0.4



RBF SVM
	score_train:  1.0
	score_dev:  0.5



Gaussian Process
	score_train:  1.0
	score_dev:  0.3



Decision Tree
	score_train:  1.0
	score_dev:  0.4



Random Forest


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


	score_train:  0.9666666666666667
	score_dev:  0.6



Neural Net
	score_train:  0.5888888888888889
	score_dev:  0.5



AdaBoost
	score_train:  0.4666666666666667
	score_dev:  0.3



Naive Bayes
	score_train:  0.6888888888888889
	score_dev:  0.3



QDA
	score_train:  0.6888888888888889
	score_dev:  0.5





  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


## Using XGBoost

In [21]:
# My gridsearch (greedy): fixed max_depth, sweep n_estimators from 100 to 2000
# in steps of 100. For each setting, record train/dev accuracy and per-grade
# precision / recall / f1 on the dev set, rewriting the CSV after every model.

# Grades reported in the table. Passing them to classification_report keeps the
# "1"/"2"/"3" keys present even when a grade is absent from the small dev split.
# Assumes damage_grade holds the integers 1, 2 and 3, as the result columns imply.
damage_grades = [1, 2, 3]
metric_names = ["precision", "recall", "f1-score"]
result_columns = ["acc_train", "acc_dev", "max_depth", "n_estimator"] + [
    "%s_%d" % (metric, grade) for grade in damage_grades for metric in metric_names
]

max_depth = 6
n_estimators =  range(100,2100,100)

# to_csv does not create missing directories.
os.makedirs("results", exist_ok=True)

# Recent xgboost releases reject class labels that are not 0..k-1, so train on
# encoded labels and map the predictions back to the original grades.
classes, y_train_encoded = np.unique(y_train, return_inverse=True)

# Rows are collected in a list and the frame is rebuilt from it;
# DataFrame.append was removed in pandas 2.0.
records = []
df_results_XGBoost = pd.DataFrame(columns=result_columns)

for n_estimator in n_estimators:
    print("n_estimators:",n_estimator)

    model = xgb.XGBClassifier(n_estimators=n_estimator,max_depth=max_depth)
    model.fit(x_train, y_train_encoded)

    y_train_pred = classes[np.asarray(model.predict(x_train)).astype(int)]
    y_dev_pred = classes[np.asarray(model.predict(x_dev)).astype(int)]

    accuracy_train = accuracy_score(y_train, y_train_pred)
    accuracy_dev = accuracy_score(y_dev, y_dev_pred)

    # The per-grade metrics must describe this XGBoost model, so they are built
    # from its own dev predictions rather than from any other fitted classifier
    # still present in the kernel.
    dict_report = classification_report(y_dev, y_dev_pred, labels=damage_grades, output_dict=True)

    record = {"acc_train": accuracy_train, "acc_dev": accuracy_dev,
              "max_depth": max_depth, "n_estimator": n_estimator}
    for grade in damage_grades:
        for metric in metric_names:
            # Report keys are the string form of each label.
            record["%s_%d" % (metric, grade)] = dict_report[str(grade)][metric]
    records.append(record)

    df_results_XGBoost = pd.DataFrame(records, columns=result_columns)
    df_results_XGBoost.to_csv("results/df_results_XGBoost.csv")
    del model
    print("\taccuracy_train",accuracy_train)
    print("\taccuracy_dev",accuracy_dev)
        

n_estimators: 100
	accuracy_train 1.0
	accuracy_dev 0.6
n_estimators: 200


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


	accuracy_train 1.0
	accuracy_dev 0.5
n_estimators: 300


  'precision', 'predicted', average, warn_for)


	accuracy_train 1.0
	accuracy_dev 0.5
n_estimators: 400


  'precision', 'predicted', average, warn_for)


	accuracy_train 1.0
	accuracy_dev 0.5
n_estimators: 500


  'precision', 'predicted', average, warn_for)


	accuracy_train 1.0
	accuracy_dev 0.5
n_estimators: 600


  'precision', 'predicted', average, warn_for)


	accuracy_train 1.0
	accuracy_dev 0.5
n_estimators: 700


  'precision', 'predicted', average, warn_for)


	accuracy_train 1.0
	accuracy_dev 0.5
n_estimators: 800


  'precision', 'predicted', average, warn_for)


	accuracy_train 1.0
	accuracy_dev 0.5
n_estimators: 900


  'precision', 'predicted', average, warn_for)


	accuracy_train 1.0
	accuracy_dev 0.5
n_estimators: 1000


  'precision', 'predicted', average, warn_for)


	accuracy_train 1.0
	accuracy_dev 0.5
n_estimators: 1100


  'precision', 'predicted', average, warn_for)


	accuracy_train 1.0
	accuracy_dev 0.5
n_estimators: 1200


  'precision', 'predicted', average, warn_for)


	accuracy_train 1.0
	accuracy_dev 0.5
n_estimators: 1300


  'precision', 'predicted', average, warn_for)


	accuracy_train 1.0
	accuracy_dev 0.5
n_estimators: 1400


  'precision', 'predicted', average, warn_for)


	accuracy_train 1.0
	accuracy_dev 0.5
n_estimators: 1500


  'precision', 'predicted', average, warn_for)


	accuracy_train 1.0
	accuracy_dev 0.5
n_estimators: 1600


  'precision', 'predicted', average, warn_for)


	accuracy_train 1.0
	accuracy_dev 0.5
n_estimators: 1700


  'precision', 'predicted', average, warn_for)


	accuracy_train 1.0
	accuracy_dev 0.5
n_estimators: 1800


  'precision', 'predicted', average, warn_for)


	accuracy_train 1.0
	accuracy_dev 0.5
n_estimators: 1900


  'precision', 'predicted', average, warn_for)


	accuracy_train 1.0
	accuracy_dev 0.5
n_estimators: 2000
	accuracy_train 1.0
	accuracy_dev 0.5


  'precision', 'predicted', average, warn_for)
