In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sbn
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn import preprocessing
from sklearn.metrics import average_precision_score, confusion_matrix, accuracy_score, classification_report, plot_confusion_matrix
from sklearn.neighbors import KNeighborsClassifier
from sklearn.impute import SimpleImputer
from sklearn.naive_bayes import GaussianNB
from scipy.stats import zscore
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import StackingClassifier
from sklearn.model_selection import cross_val_score, RandomizedSearchCV, GridSearchCV, KFold, cross_validate, cross_val_predict
from sklearn.tree import DecisionTreeRegressor
from sklearn.decomposition import PCA
from sklearn.pipeline import make_pipeline
from sklearn.svm import SVC
from sklearn.model_selection import KFold
import tensorflow as tf
import warnings
# Install a global "ignore" filter so that no warning is shown by later cells.
# NOTE(review): this also hides pandas / scikit-learn deprecation and
# data-conversion warnings, so genuine problems will not surface while it is active.
warnings.filterwarnings('ignore')
# IPython magic: render matplotlib figures inline in the cell output.
%matplotlib inline

In [None]:
train_set_path = "../input/stayalert/fordTrain.csv" 
test_set_path = "../input/stayalert/fordTest.csv"

In [None]:
training_dataset = pd.read_csv(train_set_path)
testing_dataset = pd.read_csv(test_set_path)

In [None]:
training_dataset.head()

In [None]:
testing_dataset.head()

In [None]:
# Remove the 'IsAlert' column from the test frame so it holds predictors only.
# errors='ignore' keeps this cell idempotent: re-running it after the column
# is already gone no longer raises KeyError.
testing_dataset = testing_dataset.drop(columns='IsAlert', errors='ignore')
testing_dataset.head()

In [None]:
training_dataset.shape, testing_dataset.shape

In [None]:
training_dataset.isnull().sum()

In [None]:
testing_dataset.isnull().sum()

In [None]:
training_dataset.info()

In [None]:
training_dataset = training_dataset.drop(['TrialID', 'ObsNum'], axis=1)
training_dataset.head()

In [None]:
testing_dataset = testing_dataset.drop(['TrialID', 'ObsNum'], axis=1)
testing_dataset.head()

In [None]:
corelation_training_o = training_dataset.corr()
fig_t_o, ax_t_o = plt.subplots(figsize=(50,50))
sbn.heatmap(corelation_training_o, annot=True, cmap='Blues', fmt='g', ax=ax_t_o )

In [None]:
training_dataset.skew()

In [None]:
features = training_dataset.iloc[:, 1: ]
targets = training_dataset.iloc[:, :1]

In [None]:
features.shape, targets.shape

In [None]:
# Hold out 30% of the labelled rows for evaluation; the seed fixes the shuffle.
features_train, features_test, targets_train, targets_test = train_test_split(
    features,
    targets,
    test_size=0.30,
    random_state=42,
)
# Report the shape of each of the four pieces.
for split_name, split_frame in (
    ("features_train", features_train),
    ("targets_train", targets_train),
    ("features_test", features_test),
    ("targets_test", targets_test),
):
    print(f"{split_name} shape: {split_frame.shape}")

In [None]:
# Fit a random forest on every feature; its feature_importances_ are read by
# the following cells.
# - The tf.device(...) wrapper was removed: scikit-learn estimators do not run
#   through TensorFlow, so the context manager had no effect on this fit.
# - random_state makes the forest (and therefore the importances) reproducible.
# - n_jobs=-1 builds the trees on all available CPU cores.
# - np.ravel turns the one-column target frame into the 1-D array fit() expects.
feature_selection_model = RandomForestClassifier(random_state=42, n_jobs=-1)
feature_selection_model.fit(features_train, np.ravel(targets_train))


In [None]:
# One row per feature, ordered from most to least important, with a running
# total of the importances alongside.
imp_features_df = (
    pd.DataFrame(
        {'importance': feature_selection_model.feature_importances_},
        index=features_train.columns,
    )
    .sort_values(by='importance', ascending=False)
)
imp_features_df["cummulative_importance"] = imp_features_df["importance"].cumsum()
# Keep the leading features whose running total has not yet exceeded 0.96.
within_budget = imp_features_df["cummulative_importance"] <= 0.96
most_important = imp_features_df.loc[within_budget]

In [None]:
# Summary table of the retained features: name, column position inside
# features_train, individual importance and cumulative importance.
indexs_df = pd.DataFrame(most_important.index, columns=['Features']).assign(
    Index=list(map(features_train.columns.get_loc, most_important.index)),
    Importance=most_important["importance"].tolist(),
    Cumm_Importance=most_important["cummulative_importance"].tolist(),
)
indexs_df

In [None]:
rf_pred = feature_selection_model.predict(features_test)

In [None]:
print("RANDOMFOREST CLASSIFACTION REPORT")
print("\n")
print(classification_report(targets_test, rf_pred))

In [None]:
print("CONFUSION MATRIX FOR RANDOMFOREST")
# Fix the class order explicitly so rows and columns can be labelled; without
# labels the heatmap does not show which axis is "actual" and which is "predicted".
rf_class_labels = feature_selection_model.classes_
confusion_matrix_rf = confusion_matrix(targets_test, rf_pred, labels=rf_class_labels)
# scikit-learn convention: rows are the true classes, columns the predicted ones.
confusion_matrix_rf_df = pd.DataFrame(
    confusion_matrix_rf, index=rf_class_labels, columns=rf_class_labels
)
fig_cm_rf, ax_cm_rf = plt.subplots(figsize=(10, 7))
sbn.heatmap(confusion_matrix_rf_df, annot=True, cmap='Blues', fmt='g', ax=ax_cm_rf)
ax_cm_rf.set(
    xlabel='Predicted IsAlert',
    ylabel='Actual IsAlert',
    title='Random forest (all features)',
);

In [None]:
# Column positions (within features_train / features_test) of the retained features.
selected_positions = indexs_df["Index"].tolist()
# Restrict both halves of the split to those columns, in importance order.
new_important_features_training_dataset = features_train.iloc[:, selected_positions]
new_important_features_testing_dataset = features_test.iloc[:, selected_positions]

In [None]:
new_important_features_training_dataset.head()

In [None]:
new_important_features_testing_dataset.head()

In [None]:
new_important_features_training_dataset.shape

In [None]:
new_important_features_testing_dataset.shape

In [None]:
# Refit a random forest using only the retained (most important) features.
# - The tf.device(...) wrapper was removed: scikit-learn estimators do not run
#   through TensorFlow, so the context manager had no effect on this fit.
# - random_state makes the fitted model reproducible across runs.
# - n_jobs=-1 builds the trees on all available CPU cores.
# - np.ravel turns the one-column target frame into the 1-D array fit() expects.
new_random_forest = RandomForestClassifier(random_state=42, n_jobs=-1)
new_random_forest.fit(new_important_features_training_dataset, np.ravel(targets_train))
print(new_random_forest)

In [None]:
new_rf_pred = new_random_forest.predict(new_important_features_testing_dataset)

In [None]:
print("RANDOMFOREST NEW CLASSIFACTION REPORT")
print("\n")
print(classification_report(targets_test, new_rf_pred))

In [None]:
print("CONFUSION MATRIX FOR RANDOMFOREST")
# Fix the class order explicitly so rows and columns can be labelled; without
# labels the heatmap does not show which axis is "actual" and which is "predicted".
new_rf_class_labels = new_random_forest.classes_
confusion_matrix_rf_new = confusion_matrix(targets_test, new_rf_pred, labels=new_rf_class_labels)
# scikit-learn convention: rows are the true classes, columns the predicted ones.
confusion_matrix_rf_new_df = pd.DataFrame(
    confusion_matrix_rf_new, index=new_rf_class_labels, columns=new_rf_class_labels
)
fig_cm_rf_new, ax_cm_rf_new = plt.subplots(figsize=(10, 7))
sbn.heatmap(confusion_matrix_rf_new_df, annot=True, cmap='Blues', fmt='g', ax=ax_cm_rf_new)
ax_cm_rf_new.set(
    xlabel='Predicted IsAlert',
    ylabel='Actual IsAlert',
    title='Random forest (selected features)',
);

In [None]:
important_features_testing_dataset = testing_dataset.iloc[:, list(indexs_df.Index)]
important_features_testing_dataset.head()

In [None]:
important_features_testing_dataset.shape

In [None]:
validation_pred = new_random_forest.predict(important_features_testing_dataset)

In [None]:
pd.DataFrame(validation_pred, columns=['IsAlert']).value_counts()

In [None]:
# Re-read only the first two columns of the test file (the ones the original
# positional slice 0:2 kept) instead of loading every column again.
# Reading them directly also yields an independent frame, so adding the
# prediction column is no longer an assignment into a slice of another frame.
tid_oid_df = pd.read_csv(test_set_path, usecols=[0, 1])
# Row order matches the frame the predictions were made from; a length
# mismatch raises ValueError here.
tid_oid_df['IsAlert'] = validation_pred
tid_oid_df.head()

In [None]:
# Write the submission next to the notebook, without the DataFrame index column.
output_path = './'
submission_file = f"{output_path}submission.csv"
tid_oid_df.to_csv(submission_file, index=False)