In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for dirname, _, filenames in os.walk('/kaggle/input'):
    full_paths = (os.path.join(dirname, filename) for filename in filenames)
    for full_path in full_paths:
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load the labelled training set and the unlabelled competition test set.
train_data, test_data = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/kaggle/input/forest-cover-type-prediction/train.csv',
        '/kaggle/input/forest-cover-type-prediction/test.csv',
    )
)

In [None]:
# Features: every column except the row identifier and the label.
X = train_data.drop(columns=['Id', 'Cover_Type'])
# Target: the label column.
y = train_data['Cover_Type']


In [None]:
# Preview the first rows of the feature matrix.
X.head()

# Train Test Split

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the labelled rows for validation; the seed makes the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=40,
)

# Keep untouched copies of the split (all features) before any feature selection
# modifies x_train / x_test; the "full" model further down is trained on these.
x_train_full, x_test_full = x_train.copy(), x_test.copy()
y_train_full, y_test_full = y_train.copy(), y_test.copy()

In [None]:
# Preview the first rows of the held-out feature split.
x_test.head()

# Removing Constant Columns

In [None]:
from sklearn.feature_selection import VarianceThreshold
# Fit a zero-variance filter on the training split only.
var_thres = VarianceThreshold(threshold=0)
var_thres.fit(x_train)

# get_support() is True for the columns the filter keeps, so the negated mask
# picks out the columns that never vary in the training rows.
constant_columns = list(x_train.columns[~var_thres.get_support()])

print(len(constant_columns))

for feature in constant_columns:
    print(feature)

# Remove the same columns from both splits so their schemas stay aligned.
x_test = x_test.drop(columns=constant_columns)
x_train = x_train.drop(columns=constant_columns)
x_test.head()

In [None]:
# Renumber the rows of every split from 0.
# drop=True discards the old index instead of inserting it as an "index" column,
# so no follow-up drop is needed and y_train / y_test stay 1-D Series
# (a plain reset_index() would turn them into single-column DataFrames).
x_train = x_train.reset_index(drop=True)
x_test = x_test.reset_index(drop=True)
y_train = y_train.reset_index(drop=True)
y_test = y_test.reset_index(drop=True)

In [None]:
# Preview the first rows of the training feature split.
x_train.head()

# Removing Highly Correlated Features

In [None]:
import seaborn as sns
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
# Pairwise correlation of the training features (DataFrame.corr default method),
# rendered as an annotated heatmap.
cor = x_train.corr()
fig, ax = plt.subplots(figsize=(12, 10))
sns.heatmap(cor, annot=True, cmap=plt.cm.CMRmap_r, ax=ax)
plt.show()

In [None]:
# With the following function we can select highly correlated features.
# For every pair of columns whose absolute correlation exceeds the threshold,
# it flags the later column of the pair (the earlier column is kept).

def correlation(dataset, threshold):
    """Return the names of columns that are highly correlated with an earlier column.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame whose pairwise column correlations are computed with ``dataset.corr()``.
    threshold : float
        A column is flagged when the absolute correlation with at least one
        column to its left is strictly greater than this value.

    Returns
    -------
    set
        Names of the flagged columns. For each correlated pair only the later
        column is included; NaN correlations never exceed the threshold.
    """
    corr_matrix = dataset.corr()
    flagged = set()
    for position, column in enumerate(corr_matrix.columns):
        # Row `position`, restricted to the columns before it: the strictly
        # lower-triangular part of the matrix, so each pair is checked once.
        against_earlier = corr_matrix.iloc[position, :position]
        if (against_earlier.abs() > threshold).any():
            flagged.add(column)
    return flagged

In [None]:
# Flag highly correlated features (|r| > 0.7) using the training split only.
# The raw train_data frame must not be used here: it still contains Id,
# Cover_Type and the validation rows, which would leak held-out information and
# could flag columns that do not exist in x_train (making drop() raise KeyError).
corr_features = correlation(x_train, 0.7)
print(len(corr_features))

# Drop the flagged columns from both splits so their schemas stay aligned.
x_train = x_train.drop(columns=list(corr_features))
x_test = x_test.drop(columns=list(corr_features))

In [None]:
# Display the set of feature names flagged as highly correlated.
corr_features

# Feature Selection-Information gain - mutual information

In [None]:
from sklearn.feature_selection import mutual_info_classif
# Estimate the mutual information between every training feature and the target.
# random_state is fixed because the estimator is randomised; without it the
# ranking changes between runs. np.ravel passes the target as a 1-D array
# whether y_train is a Series or a single-column DataFrame.
mutual_info = pd.Series(
    mutual_info_classif(x_train, np.ravel(y_train), random_state=40),
    index=x_train.columns,
)
mutual_info.sort_values(ascending=False)

In [None]:

# Bar chart of the mutual-information scores, highest first.
mutual_info.sort_values(ascending=False).plot(kind='bar', figsize=(20, 8))

In [None]:
from sklearn.feature_selection import SelectKBest
# Keep the 10 features with the highest mutual-information score (k=10).
# The selector variable keeps its historical name even though it selects ten columns.
sel_five_cols = SelectKBest(score_func=mutual_info_classif, k=10).fit(x_train, y_train)
# Names of the columns the selector retained.
new_col = x_train.columns[sel_five_cols.get_support()]

In [None]:
# Restrict both splits to the selected columns and report the resulting shapes.
x_train, x_test = x_train[new_col], x_test[new_col]
print(x_train.shape, x_test.shape, sep='\n')

# Fisher Score- Chisquare Test For Feature Selection

In [None]:
## Perform chi2 test
### chi2 returns 2 arrays:
### the chi-square statistic and the p-value of every feature
from sklearn.feature_selection import chi2

# chi2 only accepts non-negative feature values and raises otherwise, so the
# test is limited to the columns that have no negative entries.
non_negative_cols = x_train.columns[(x_train >= 0).all()]
f_p_values = chi2(x_train[non_negative_cols], np.ravel(y_train))
p_values = pd.Series(f_p_values[1], index=non_negative_cols)

# Rank by p-value (smallest first); sorting by index would only order the
# feature names alphabetically.
p_values.sort_values()

# Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier
# Random forest on the selected features.
# random_state makes the fitted forest, and therefore the reported accuracy,
# reproducible across runs. np.ravel passes the targets as 1-D arrays whether
# they are Series or single-column DataFrames.
rfc = RandomForestClassifier(n_estimators=70, random_state=40)
rfc.fit(x_train, np.ravel(y_train))
# Mean accuracy on the held-out split.
rfc.score(x_test, np.ravel(y_test))

# XGBClassifier

In [None]:
from xgboost import XGBClassifier
# Gradient-boosted classifier trained on the selected features.
# The held-out split is used both to stop boosting early and to compute the score
# below, so that score is an optimistic estimate.
# NOTE(review): newer XGBoost releases appear to expect early_stopping_rounds on the
# constructor and class labels starting at 0 - confirm against the installed version.
my_model = XGBClassifier(learning_rate=0.05, n_estimators=1000, n_jobs=4)
my_model.fit(
    x_train,
    y_train,
    eval_set=[(x_test, y_test)],
    early_stopping_rounds=5,
    verbose=False,
)
my_model.score(x_test, y_test)

In [None]:
# Same classifier, trained on the copies of the split taken before feature
# selection, i.e. with every original feature column.
# The held-out split drives early stopping and is also the split being scored,
# so the reported accuracy is an optimistic estimate.
# NOTE(review): newer XGBoost releases appear to expect early_stopping_rounds on the
# constructor and class labels starting at 0 - confirm against the installed version.
my_model_full = XGBClassifier(learning_rate=0.05, n_estimators=1000, n_jobs=4)
my_model_full.fit(
    x_train_full,
    y_train_full,
    eval_set=[(x_test_full, y_test_full)],
    early_stopping_rounds=5,
    verbose=False,
)
my_model_full.score(x_test_full, y_test_full)

In [None]:
# Predict the cover type of every competition test row with the all-features model.
# The Id column is not a feature, so it is removed before predicting.
test_data_temp = test_data.copy()
predict = my_model_full.predict(test_data.drop(columns=['Id']))

# One predicted Cover_Type per test row, indexed by the matching Id.
Submission = (
    pd.DataFrame(data=predict, columns=['Cover_Type'])
    .assign(Id=test_data_temp['Id'])
    .set_index('Id')
)
Submission.head()


In [None]:
# Write the predictions to Submission.csv in the current working directory;
# the Id index is written as the first column.
Submission.to_csv('Submission.csv')