In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from sklearn.pipeline import make_pipeline
from sklearn.svm import SVC
from sklearn.feature_selection import SelectKBest,chi2
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.decomposition import PCA

In [2]:
train = pd.read_csv('data/train.csv')
test=pd.read_csv('data/test.csv')

In [12]:
train.columns

Index(['Elevation(meters)', 'Aspect(degrees)', 'Slope(degrees)',
       'Horizontal_Distance_To_Hydrology(meters)',
       'Vertical_Distance_To_Hydrology(meters)',
       'Horizontal_Distance_To_Roadways(meters)', 'Hillshade_9am',
       'Hillshade_Noon', 'Hillshade_3pm',
       'Horizontal_Distance_To_Fire_Points(meters)', 'Wilderness_Area_1',
       'Wilderness_Area_2', 'Wilderness_Area_3', 'Wilderness_Area_4',
       'Soil_Type_1', 'Soil_Type_2', 'Soil_Type_3', 'Soil_Type_4',
       'Soil_Type_5', 'Soil_Type_6', 'Soil_Type_7', 'Soil_Type_8',
       'Soil_Type_9', 'Soil_Type_10', 'Soil_Type_11', 'Soil_Type_12',
       'Soil_Type_13', 'Soil_Type_14', 'Soil_Type_15', 'Soil_Type_16',
       'Soil_Type_17', 'Soil_Type_18', 'Soil_Type_19', 'Soil_Type_20',
       'Soil_Type_21', 'Soil_Type_22', 'Soil_Type_23', 'Soil_Type_24',
       'Soil_Type_25', 'Soil_Type_26', 'Soil_Type_27', 'Soil_Type_28',
       'Soil_Type_29', 'Soil_Type_30', 'Soil_Type_31', 'Soil_Type_32',
       'Soil_Type_33', 

In [13]:
train['Cover_Type'].value_counts()

5    14165
7    10592
6     1788
1     1025
3      868
2      475
4      137
Name: Cover_Type, dtype: int64

In [17]:
train['Cover_Type'].value_counts().max()/train['Cover_Type'].value_counts().sum()

0.48760757314974185

In [3]:
sc_X = MinMaxScaler()
train_X = sc_X.fit_transform(train.loc[:,train.columns != 'Cover_Type'])
train_y = train.loc[:,train.columns == 'Cover_Type']
test_X = sc_X.transform(test)
cols = train.columns

In [19]:
def return_best_features(features,target,cols):
 bestfeatures = SelectKBest(score_func=chi2, k='all')
 fit = bestfeatures.fit(features,target)
 dfscores = pd.DataFrame(fit.scores_)
 dfcolumns = pd.DataFrame(cols)
 featureScores = pd.concat([dfcolumns,dfscores,pd.DataFrame(bestfeatures.pvalues_)],axis=1)
 featureScores.columns = ['Specs','Score','Pvalue']
 return featureScores

In [20]:
best_features = return_best_features(train_X,train_y,cols)
mean_score = best_features['Score'].describe()['mean']
print(best_features[best_features['Pvalue'] < 0.05].shape)
print(best_features[best_features['Score'] >= mean_score].shape)

(47, 3)
(15, 3)


In [21]:
def find_model(pipeline,param_grid):
    grid = GridSearchCV(pipeline, n_jobs=-1, param_grid=param_grid,scoring=['neg_log_loss','accuracy'],refit='neg_log_loss')
    grid.fit(train_X, train_y)
    print(grid.best_params_)
    print(grid.best_score_)
    print(grid.cv_results_['mean_test_accuracy'].mean())
    return grid

In [22]:
def write_csv(grid):
    result=pd.DataFrame(grid.predict_proba(test_X))
    pd.DataFrame([[1.0,2.0,3.0,4.0,5.0,6.0,7.0]]).append(result).to_csv('submission.csv',index=False,header=False)

In [23]:
pipeline = make_pipeline(SelectKBest(chi2,'all'),RandomForestClassifier(random_state=43))
param_grid ={
    'randomforestclassifier__n_estimators':[100,150,200],
    'selectkbest__k':[15,47,'all']
}
rff_grid = find_model(pipeline,param_grid)

{'randomforestclassifier__n_estimators': 200, 'selectkbest__k': 47}
-0.4004530729093367
0.77184165232358


In [24]:
pipeline = make_pipeline(SelectKBest(chi2,'all'), KNeighborsClassifier())
param_grid ={
    'kneighborsclassifier__n_neighbors':np.arange(8,15),
    'selectkbest__k':[47]
}
knn_grid = find_model(pipeline,param_grid)

{'kneighborsclassifier__n_neighbors': 14, 'selectkbest__k': 47}
-0.7758791133398094
0.790518809933612


In [25]:
pipeline = make_pipeline(SelectKBest(chi2,'all'),GradientBoostingClassifier(random_state=43))
param_grid ={
    'gradientboostingclassifier__n_estimators':[100,150,200],
    'selectkbest__k':[15,47,'all']
}
grad_boosting_grid = find_model(pipeline,param_grid)


{'gradientboostingclassifier__n_estimators': 200, 'selectkbest__k': 'all'}
-0.5366669001440135
0.7162172499521898


In [43]:
pca = PCA(n_components=20)
r = pca.fit(train_X,train_y)
print(pca.explained_variance_ratio_.sum())

0.9077166417676557


In [44]:
pipeline = make_pipeline(PCA(),RandomForestClassifier(random_state=43))
param_grid ={
    'randomforestclassifier__n_estimators':[100,150,200],
    'pca__n_components':[15,20,25]
}
rf_pca_grid = find_model(pipeline,param_grid)

{'pca__n_components': 20, 'randomforestclassifier__n_estimators': 200}
-0.46952047677383507
0.8281162746222988


In [45]:
pipeline = make_pipeline(PCA(),GradientBoostingClassifier(random_state=43))
param_grid ={
    'gradientboostingclassifier__n_estimators':[100,150,200],
    'pca__n_components':[20]
}
grad_boosting_grid = find_model(pipeline,param_grid)

{'gradientboostingclassifier__n_estimators': 200, 'pca__n_components': 20}
-0.6393681906511526
0.7374411933448077


In [46]:
pipeline = make_pipeline(PCA(), KNeighborsClassifier())
param_grid ={
    'kneighborsclassifier__n_neighbors':np.arange(8,15),
    'pca__n_components':[20]
}
knn_grid = find_model(pipeline,param_grid)

{'kneighborsclassifier__n_neighbors': 14, 'pca__n_components': 20}
-0.8237840704496664
0.771526924022621


In [47]:
write_csv(rf_pca_grid)

In [47]:
from keras.models import Sequential
from tensorflow import random
from keras.layers import Dense, Activation

model = Sequential()
model.add(Dense(14, input_dim=20, activation='relu'))
model.add(Dense(10, input_dim=20, activation='sigmoid'))
model.add(Dense(7,activation='softmax'))
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
print(model.summary())

Model: "sequential_11"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_25 (Dense)             (None, 14)                294       
_________________________________________________________________
dense_26 (Dense)             (None, 10)                150       
_________________________________________________________________
dense_27 (Dense)             (None, 7)                 77        
Total params: 521
Trainable params: 521
Non-trainable params: 0
_________________________________________________________________
None


In [51]:
pca = PCA(n_components=20)
history = model.fit(x=pca.fit_transform(train_X), y=pd.get_dummies(train_y['Cover_Type']), epochs=20, validation_split=0.3, shuffle=False,verbose=0)

In [52]:
print(history.history['val_accuracy'][len(history.history['val_accuracy'])-1])
print(history.history['accuracy'][len(history.history['accuracy'])-1])
print(history.history['val_loss'][len(history.history['val_loss'])-1])
print(history.history['loss'][len(history.history['loss'])-1])


0.7161216139793396
0.7264814376831055
0.664169430732727
0.6431282162666321


In [53]:
result = model.predict(pca.fit_transform(test_X))


In [54]:
pd.DataFrame([[1.0,2.0,3.0,4.0,5.0,6.0,7.0]]).append(pd.DataFrame(result)).to_csv('submission.csv',index=False,header=False)