In [None]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

import missingno as msno

# Let's be rebels and ignore warnings for now
import warnings
warnings.filterwarnings('ignore')

from sklearn.preprocessing import OneHotEncoder, LabelEncoder, label_binarize

# Machine learning
import catboost
from sklearn.model_selection import train_test_split
from sklearn import model_selection, tree, preprocessing, metrics, linear_model
from sklearn.svm import LinearSVC
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LinearRegression, LogisticRegression, SGDClassifier
from sklearn.tree import DecisionTreeClassifier
from catboost import CatBoostClassifier, Pool, cv


# Disable pandas' display truncation: show every row and every column.
for _display_option in ('display.max_rows', 'display.max_columns'):
    pd.set_option(_display_option, None)

In [None]:
# Load the competition files from the Kaggle input directory.
train = pd.read_csv('../input/forest-cover-type-prediction/train.csv')
test = pd.read_csv('../input/forest-cover-type-prediction/test.csv')
test1 = pd.read_csv('../input/forest-cover-type-prediction/test3.csv')
# NOTE: `submission` is rebuilt from scratch before the submission file is written.
submission = pd.read_csv('../input/forest-cover-type-prediction/sampleSubmission.csv')

In [None]:
# Preview the first rows of the training data.
train.head()

In [None]:
# Summary statistics for the training columns.
train.describe()

# Data Analysis

1. Missing Values (There are no missing values)
2. All The Numerical Variables
3. Distribution of the Numerical Variables
4. Categorical Variables
5. Cardinality of Categorical Variables
6. Outliers
7. Relationship between the independent features and the dependent feature (Cover_Type)

## Missing Values (There are no missing Values)

In [None]:
# The heatmap of the null mask below shows that there are no null values.
fig = plt.figure(figsize=(20, 5))
sns.heatmap(train.isna(), ax=fig.add_subplot(111))
plt.show()

In [None]:
# TODO: figure out the difference between test and test1.
# Shapes are printed one per line: train, test, test1.
print(train.shape, test.shape, test1.shape, sep='\n')

In [None]:
# Preview test1 so it can be compared with the regular test set.
test1.head()

## All numerical Variables

In [None]:
# Keep every non-object column whose name mentions neither 'Soil' nor
# 'Wilderness'; those two families are excluded from the numerical set.
numerical_features = [
    column
    for column in train.columns
    if not (
        train[column].dtype == 'O'
        or 'Soil' in column
        or 'Wilderness' in column
    )
]
print(numerical_features)
print(len(numerical_features))
print(train.shape)

### Relationship between features and Cover_Type

In [None]:
# 'Id' is removed from both the frame and the feature list.
# Both statements are idempotent: re-running the cell neither raises a
# KeyError nor silently drops the next feature, unlike an in-place drop
# combined with positional slicing.
train = train.drop(columns='Id', errors='ignore')
numerical_features = [feature for feature in numerical_features if feature != 'Id']

In [None]:
# Annotated correlation matrix of the numerical features.
fig = plt.figure(figsize=(20, 5))
correlations = train[numerical_features].corr()
sns.heatmap(correlations, annot=True, ax=fig.add_subplot(111))
plt.show()

### Distribution Of Numerical Features

In [None]:
# Feature distribution of the training data set: one panel per feature.
# The grid is sized from the number of features instead of a fixed 5x3
# layout, so adding features can never overflow the subplot grid.
n_cols = 3
n_rows = max(1, int(np.ceil(len(numerical_features) / n_cols)))
fig, axes = plt.subplots(n_rows, n_cols, figsize=(20, 20), squeeze=False)
panels = axes.ravel()

# NOTE(review): sns.distplot is deprecated in recent seaborn releases;
# consider sns.histplot(..., kde=True, stat="density") once the
# environment's seaborn version is confirmed.
for ax, feature in zip(panels, numerical_features):
    sns.distplot(train[feature], ax=ax)

# Hide panels that have no feature assigned to them.
for ax in panels[len(numerical_features):]:
    ax.set_visible(False)

plt.show()
    
    

In [None]:
# Skewness of every column in the training frame.
train.skew()

In [None]:
# Kurtosis of every column in the training frame.
train.kurtosis()

In [None]:
def fit_ml_algo(algo,x_train,y_train,cv):
    """Fit ``algo`` and report its training and cross-validated accuracy.

    Parameters
    ----------
    algo : estimator
        Object exposing ``fit(x, y)``; the fitted result must expose
        ``score(x, y)``.
    x_train : feature matrix used for fitting and scoring.
    y_train : target values aligned with ``x_train``.
    cv : forwarded unchanged as the ``cv`` argument of
        ``model_selection.cross_val_predict``.

    Returns
    -------
    tuple
        ``(train_pred, acc, acc_cv)`` where ``train_pred`` is the output of
        ``cross_val_predict``, ``acc`` is the score on the training data and
        ``acc_cv`` is the accuracy of ``train_pred`` against ``y_train``;
        both accuracies are percentages rounded to two decimals.
    """
    fitted = algo.fit(x_train, y_train)
    training_accuracy = round(fitted.score(x_train, y_train) * 100, 2)

    # Cross-validated predictions use all available cores (n_jobs=-1).
    cv_predictions = model_selection.cross_val_predict(
        algo, x_train, y_train, cv=cv, n_jobs=-1
    )
    cv_accuracy = round(
        metrics.accuracy_score(y_train, cv_predictions) * 100, 2
    )

    return (cv_predictions, training_accuracy, cv_accuracy)

In [None]:
# Preview the candidate feature columns. `x_train` is not defined yet at
# this point of a fresh top-to-bottom run, so slice `train` directly.
train[numerical_features].head()

In [None]:
# Target vector.
y_train = train['Cover_Type']
# Feature matrix: numerical features without the target. Built explicitly
# from `train` so the cell works on a fresh kernel; errors='ignore' keeps it
# re-runnable if 'Cover_Type' is no longer in `numerical_features`.
x_train = train[numerical_features].drop(columns=['Cover_Type'], errors='ignore')

In [None]:
# Feature matrix: numerical features without the target column.
# A non-mutating drop avoids modifying a frame sliced from `train` in place,
# and errors='ignore' keeps the cell re-runnable if 'Cover_Type' is no
# longer in `numerical_features`.
x_train = train[numerical_features].drop(columns=['Cover_Type'], errors='ignore')

In [None]:
# Check the feature matrix after the target column has been dropped.
x_train.head()

## Logistic Regression

In [None]:
# Logistic regression with default settings, evaluated with 10-fold CV.
train_pred_log, acc_log, acc_cv_log = fit_ml_algo(
    algo=LogisticRegression(),
    x_train=x_train,
    y_train=y_train,
    cv=10,
)
print(f"Accuracy: {acc_log}")
print(f"Accuracy CV 10-Fold: {acc_cv_log}")

## KNN

In [None]:
# k-nearest neighbours with default settings, evaluated with 10-fold CV.
train_pred_knn, acc_knn, acc_cv_knn = fit_ml_algo(
    algo=KNeighborsClassifier(),
    x_train=x_train,
    y_train=y_train,
    cv=10,
)
print(f"Accuracy: {acc_knn}")
print(f"Accuracy CV 10-Fold: {acc_cv_knn}")

## Gaussian Naive Bayes

In [None]:
# Gaussian naive Bayes with default settings, evaluated with 10-fold CV.
train_pred_nb, acc_nb, acc_cv_nb = fit_ml_algo(
    algo=GaussianNB(),
    x_train=x_train,
    y_train=y_train,
    cv=10,
)
print(f"Accuracy: {acc_nb}")
print(f"Accuracy CV 10-Fold: {acc_cv_nb}")

## Linear Support Vector Machine 

In [None]:
# Linear SVM, evaluated with 10-fold CV.
# random_state is fixed (same seed as the final gradient-boosting model) so
# the reported accuracies are reproducible across runs.
train_pred_svc, acc_svc, acc_cv_svc = fit_ml_algo(LinearSVC(random_state=0), x_train, y_train, 10)

print("Accuracy: "+str(acc_svc))

print("Accuracy CV 10-Fold: "+str(acc_cv_svc))

## Stochastic Gradient Descent

In [None]:
# Linear model trained with stochastic gradient descent, 10-fold CV.
# random_state is fixed (same seed as the final gradient-boosting model) so
# the reported accuracies are reproducible across runs.
train_pred_sgd, acc_sgd, acc_cv_sgd = fit_ml_algo(SGDClassifier(random_state=0), x_train, y_train, 10)

print('Accuracy: '+str(acc_sgd))

print('Accuracy CV 10-Fold: '+str(acc_cv_sgd))

## Decision Tree Classifier

In [None]:
# Decision tree, evaluated with 10-fold CV.
# random_state is fixed (same seed as the final gradient-boosting model) so
# the reported accuracies are reproducible across runs.
train_pred_dt, acc_dt, acc_cv_dt = fit_ml_algo(DecisionTreeClassifier(random_state=0), x_train, y_train, 10)

print('Accuracy: '+str(acc_dt))

print('Accuracy CV 10-Fold: '+str(acc_cv_dt))

## Gradient Boost Trees

In [None]:
# Gradient-boosted trees, evaluated with 10-fold CV.
# random_state is fixed (same seed as the final gradient-boosting model) so
# the reported accuracies are reproducible across runs.
train_pred_gbt, acc_gbt, acc_cv_gbt = fit_ml_algo(GradientBoostingClassifier(random_state=0), x_train, y_train, 10)

print('Accuracy: '+str(acc_gbt))

print('Accuracy CV 10-Fold: '+str(acc_cv_gbt))

In [None]:
# The test set has no target column, so exclude 'Cover_Type' from the
# feature list. Filtering (instead of list.remove) keeps the cell idempotent:
# re-running it does not raise ValueError.
numerical_features = [feature for feature in numerical_features if feature != 'Cover_Type']
test[numerical_features].head()

In [None]:
# Final model: a small, seeded gradient-boosting classifier fitted on the
# full training feature matrix.
gb_clf2 = GradientBoostingClassifier(n_estimators=20, learning_rate=0.5, max_features=2, max_depth=2, random_state=0)
gb_clf2.fit(x_train, y_train)
# Select the test columns from x_train.columns so the prediction input always
# has exactly the features (and order) the model was fitted on, regardless of
# the current contents of `numerical_features`.
predictions = gb_clf2.predict(test[x_train.columns])
predictions

In [None]:
# Assemble the submission frame: test ids paired with the model predictions
# on the test dataset.
submission = pd.DataFrame({
    'Id': test['Id'],
    'Cover_Type': predictions,
})
submission.head()

In [None]:
# Write the submission file without the DataFrame index column.
submission.to_csv('submission1.csv', index=False)