In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found in it.
input_file_paths = (
    os.path.join(folder, file_name)
    for folder, _subfolders, file_names in os.walk('/kaggle/input')
    for file_name in file_names
)
for file_path in input_file_paths:
    print(file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load the six raw CSV tables; later cells join them on their shared `id` column.
# The labelled and unlabelled id tables are read first, then the four lookup tables.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in ("../input/traincsv/train.csv", "../input/test-csv/test.csv")
)
features, resources, severity, event = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../input/feature/log_feature.csv",
        "../input/resources/resource_type.csv",
        "../input/severity-type/severity_type.csv",
        "../input/eventcsv/event_type.csv",
    )
)

Train Data Preparation

In [None]:
# Left-join the lookup tables onto the training rows one table at a time, always
# matching on `id`, so every training id is kept.
tabla1 = pd.merge(train, severity, how='left', on='id')
tabla2 = pd.merge(tabla1, resources, how='left', on='id')
tabla3 = pd.merge(tabla2, features, how='left', on='id')
tabla4 = pd.merge(tabla3, event, how='left', on='id')

In [None]:
# Preview the first 20 rows of the fully merged training table.
tabla4.head(20)

In [None]:
# (rows, columns) of the merged training table before duplicate ids are removed.
tabla4.shape

In [None]:
# Keep only the first merged row for each id in the train dataset (modifies tabla4 in place).
# NOTE(review): if the joined tables hold several rows per id, every row after the
# first is discarded here — confirm that dropping those values is intended.
tabla4.drop_duplicates(subset= 'id', keep= 'first', inplace = True)

In [None]:
# (rows, columns) of the training table after keeping one row per id.
tabla4.shape

In [None]:
# Preview the de-duplicated training table (default: first 5 rows).
tabla4.head()

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Bar chart of how many training rows carry each severity_type value.
# The column must be passed as `x=`: recent seaborn releases accept only `data`
# positionally, so a bare Series would be treated as a wide-form dataset rather
# than as the variable to count.
fig, ax = plt.subplots(figsize=(14, 6))
sns.countplot(x=tabla4['severity_type'], ax=ax)
ax.set_title('Training rows per severity_type')
plt.show()

In [None]:
# Number of de-duplicated training rows for each fault_severity value (the target).
tabla4.fault_severity.value_counts()

In [None]:
# Bar chart of the target distribution in the raw training table.
# Passed as `x=` because recent seaborn releases only accept `data` positionally;
# a bare positional Series is not counted as a categorical variable.
sns.countplot(x=train['fault_severity'])

In [None]:
# Contingency table: fault_severity values as rows, severity_type values as columns
# (no row/column totals).
crosstab_severity = pd.crosstab(
    index=tabla4['fault_severity'],
    columns=tabla4['severity_type'],
    margins=False,
)
print(crosstab_severity)

In [None]:
from sklearn.preprocessing import LabelEncoder

# Integer-encode every categorical column into a sibling `<column>_lab` column.
# A single encoder object is re-fit for each column in turn, so after the loop it
# only remembers the mapping of the last column.
lb = LabelEncoder()
categorical_columns = ['location', 'severity_type', 'resource_type', 'log_feature', 'event_type']
for column in categorical_columns:
    tabla4[f'{column}_lab'] = lb.fit_transform(tabla4[column])

# Target vector, and a feature matrix indexed by id holding only the remaining
# original columns plus the encoded ones.
y = tabla4['fault_severity']
X = tabla4.drop(columns=['fault_severity'] + categorical_columns).set_index('id')
X.head(5)

In [None]:
# Hold out 15% of the labelled rows for evaluation; the remaining 85% is used for fitting.
from sklearn.model_selection import train_test_split

split_parts = train_test_split(X, y, test_size=0.15, random_state=101)
X_train, X_test, y_train, y_test = split_parts

Test dataset preparation

In [None]:
# Same left joins as for the training table, applied to the unlabelled test ids.
test1 = pd.merge(test, severity, how='left', on='id')
test2 = pd.merge(test1, resources, how='left', on='id')
test3 = pd.merge(test2, features, how='left', on='id')
test4 = pd.merge(test3, event, how='left', on='id')

In [None]:
# Keep only the first merged row for each id in the test dataset (modifies test4 in place).
test4.drop_duplicates(subset= 'id', keep= 'first', inplace = True)

In [None]:
# Encode the test categoricals with the SAME value -> code mapping as the training
# table. Re-fitting an encoder on the test values would number the categories by the
# sorted order of the *test* values, so one integer code could stand for different
# categories in train and test and the fitted models would misread the test features.
test_categorical_columns = ['location', 'severity_type', 'resource_type', 'log_feature', 'event_type']
for column in test_categorical_columns:
    # Fitting on the raw training column reproduces the training codes exactly,
    # because LabelEncoder numbers the sorted unique values.
    train_encoder = LabelEncoder().fit(tabla4[column])
    code_by_value = {value: code for code, value in enumerate(train_encoder.classes_)}
    # Categories that never occur in the training table get the sentinel code -1.
    test4[column + '_lab'] = test4[column].map(code_by_value).fillna(-1).astype(np.int64)

# Feature matrix for the competition test rows: drop the raw categoricals and index by id.
X_test_OK = test4.drop(test_categorical_columns, axis = 1)

X_test_OK = X_test_OK.set_index(X_test_OK.id).drop('id',axis = 1)
X_test_OK.head(5)

Random Forest

In [None]:
from sklearn.model_selection import RandomizedSearchCV

# Candidate values for the random-forest hyper-parameter search.
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]
# 'auto' is no longer an accepted value for max_features in current scikit-learn
# (for classifiers it was an alias of 'sqrt'), so it is replaced by 'log2' to keep
# two distinct options in the search.
max_features = ['sqrt', 'log2']
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)  # None = grow each tree without a depth limit
min_samples_split = [2, 5, 10]
min_samples_leaf = [1, 2, 4]
bootstrap = [True, False]

# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}
print(random_grid)


In [None]:
# First create the base model to tune
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics

# Base estimator for the search. It is seeded so that a re-run with the same sampled
# hyper-parameters grows the same forests; without a seed the chosen "best" model
# can change from run to run.
rf = RandomForestClassifier(random_state = 42)
# Randomized search: 100 sampled parameter combinations, 3-fold cross-validation each,
# run on all available cores.
# NOTE(review): no `scoring` is given, so candidates are ranked by the classifier's
# default score, while the submissions below are class probabilities — consider
# ranking by a probability-based metric.
model_rf_random = RandomizedSearchCV(estimator = rf, param_distributions = random_grid, n_iter = 100, cv = 3, verbose=2, random_state=42, n_jobs = -1)
model_rf_random.fit(X_train,y_train)


In [None]:
# Hyper-parameter combination that scored best in the random-forest search.
model_rf_random.best_params_


In [None]:
# Score the best forest from the search on the 15% hold-out split.
best_random = model_rf_random.best_estimator_
pred2 = best_random.predict(X_test)

holdout_confusion = metrics.confusion_matrix(y_test, pred2)
holdout_report = metrics.classification_report(y_test, pred2)

print('Confusion matrix \n')
print(holdout_confusion)
print('Classification report \n')
print(holdout_report)

In [None]:
# Per-class probabilities from the tuned random forest for every row of the
# competition test feature matrix (one output column per class).
submision = best_random.predict_proba(X_test_OK)


In [None]:
# Display the predicted probability array.
submision

In [None]:
# Build the submission table (test ids followed by the three class-probability
# columns) and write it to CSV without the row index.
probability_columns = ['predict_0', 'predict_1', 'predict_2']
pred_df = pd.DataFrame(data=submision, columns=probability_columns)
test_ids = pd.DataFrame(X_test_OK.index)
submission_rm = pd.concat(objs=[test_ids, pred_df], axis=1)
submission_rm.to_csv(path_or_buf='submision_1.csv', index=False, header=True)

In [None]:
# Inspect the dtype of every feature column before preparing the CatBoost inputs.
X.dtypes

CatBoost

In [None]:
# Turn the five label-encoded columns (positions 1-5 of X) into pandas categoricals.
# The columns are replaced by label: assigning through `.iloc[:, 1:6] = ...` writes
# the values into the existing integer columns and leaves their dtype unchanged, in
# which case the `dtypes != np.int64` test used for CatBoost finds no categorical
# features at all.
categorical_feature_columns = X.columns[1:6]
X[categorical_feature_columns] = X[categorical_feature_columns].astype('category')
assert (X.dtypes.iloc[1:6] == 'category').all(), "label columns were not converted to category"

In [None]:
# Redo the 85% / 15% split (same seed as before) so the train and hold-out frames
# are taken from the current state of X.
from sklearn.model_selection import train_test_split

split_parts = train_test_split(X, y, test_size=0.15, random_state=101)
X_train, X_test, y_train, y_test = split_parts

In [None]:
from catboost import CatBoostClassifier as cbc
from sklearn.metrics import accuracy_score, log_loss

# Positions of the columns CatBoost should treat as categorical: every column whose
# dtype is not int64.
cat_feature_positions = np.where(X_train.dtypes != np.int64)[0]

# Multi-class CatBoost model: 100 boosting rounds at most, stopping early once the
# hold-out split has not improved for 50 rounds; progress is logged every 10 rounds.
cat_b = cbc(
    objective='MultiClass',
    learning_rate=0.1,
    n_estimators=100,
    random_state=1,
)
cat_b = cat_b.fit(
    X=X_train,
    y=y_train,
    cat_features=cat_feature_positions,
    eval_set=(X_test, y_test),
    early_stopping_rounds=50,
    verbose=10,
)

In [None]:
# Predicted class labels from the CatBoost model for the hold-out split.
y_pred_cat = cat_b.predict(X_test)

In [None]:
# Accuracy of the CatBoost model on the hold-out split.
# NOTE(review): this is the same split that was passed as eval_set for early
# stopping when the model was fitted, so the figure may be optimistic.
val_acc_cat = accuracy_score(y_test,y_pred_cat)
val_acc_cat

In [None]:
# Submission 2
# Per-class probabilities from the CatBoost model for every row of the competition
# test feature matrix.
submision2 = cat_b.predict_proba(X_test_OK)

In [None]:
# Build the second submission table (test ids followed by the three
# class-probability columns) and write it to CSV without the row index.
probability_columns = ['predict_0', 'predict_1', 'predict_2']
pred_df2 = pd.DataFrame(data=submision2, columns=probability_columns)
test_ids = pd.DataFrame(X_test_OK.index)
submission_cat = pd.concat(objs=[test_ids, pred_df2], axis=1)
submission_cat.to_csv(path_or_buf='submision_2.csv', index=False, header=True)

In [None]:
# CatBoost with hyperparameter tuning

In [None]:
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint


# Search space for the CatBoost tuning run: ten evenly spaced tree counts between
# 200 and 2000, and a tree depth drawn from scipy's randint(3, 10) distribution.
n_estimators = np.linspace(start=200, stop=2000, num=10).astype(int).tolist()
random_grid2 = dict(
    n_estimators=n_estimators,
    max_depth=randint(3, 10),
)
print(random_grid2)

In [None]:
from catboost import CatBoostClassifier as cbc

# Base CatBoost estimator for the search. It is seeded (same seed as the untuned
# CatBoost model above) so that repeated runs of the search are comparable.
cat_b2 = cbc(objective = 'MultiClass', random_state = 1)
# Randomized search: 100 sampled (n_estimators, max_depth) combinations, 3-fold
# cross-validation each, run on all available cores.
model_catb2_random = RandomizedSearchCV(estimator = cat_b2, param_distributions = random_grid2, n_iter = 100, cv = 3, verbose=2, random_state=42, n_jobs = -1)
# cat_features is forwarded to every CatBoost fit: the positions of all columns
# whose dtype is not int64.
model_catb2_random.fit(X_train,y_train, cat_features = np.where(X_train.dtypes != np.int64)[0])

In [None]:
from sklearn.metrics import accuracy_score, log_loss

# Hold-out accuracy of the tuned CatBoost search object (predictions are made
# through the search object itself).
y_pred_cat2 = model_catb2_random.predict(X_test)
val_acc_cat2 = accuracy_score(y_test,y_pred_cat2)
val_acc_cat2

In [None]:
# Hyper-parameter combination that scored best in the CatBoost search.
model_catb2_random.best_params_

In [None]:
# Take the best CatBoost model from the search and predict per-class probabilities
# for every row of the competition test feature matrix.
best_random2 = model_catb2_random.best_estimator_
submision3 = best_random2.predict_proba(X_test_OK)

# Build the third submission table (test ids followed by the three
# class-probability columns) and write it to CSV without the row index.
probability_columns = ['predict_0', 'predict_1', 'predict_2']
pred_df3 = pd.DataFrame(data=submision3, columns=probability_columns)
test_ids = pd.DataFrame(X_test_OK.index)
submission_cat2 = pd.concat(objs=[test_ids, pred_df3], axis=1)
submission_cat2.to_csv(path_or_buf='submision_3.csv', index=False, header=True)