In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Importing Libraries.

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score

import warnings
warnings.filterwarnings('ignore')

# Reading the data.

In [None]:
!ls ../input/liverpool-ion-switching/

In [None]:
# Load the three competition files from the read-only input directory:
# the training data, the test data, and the sample submission file.
train = pd.read_csv("../input/liverpool-ion-switching/train.csv")
test = pd.read_csv("../input/liverpool-ion-switching/test.csv")
sample = pd.read_csv("../input/liverpool-ion-switching/sample_submission.csv")

# Preprocessing.

In [None]:
train.head()

In [None]:
test.head()

In [None]:
sample.head()

In [None]:
train.describe()

In [None]:
train.info()

In [None]:
train.isnull().sum()

In [None]:
print("Shape of training data is {}".format(train.shape))
print("Shape of testing data is {}".format(test.shape))
print("Shape of sample data is {}".format(sample.shape))

In [None]:
dupli = train[train.duplicated()]
dupli

In [None]:
# Class frequencies of the target, first as numbers and then as a bar chart.
print(train['open_channels'].value_counts())
fig, ax = plt.subplots(figsize = (8, 5))
sns.countplot(x = train['open_channels'], ax = ax)
ax.set_xlabel('Target', size = 12)
ax.set_ylabel('Count', size = 12)
ax.set_title('Distribution in target column', size = 12)

In [None]:
def Outliers(data, ft):
    """Return the index labels of rows whose `ft` value is an IQR outlier.

    A value is flagged when it lies more than 1.5 * IQR below the first
    quartile or more than 1.5 * IQR above the third quartile of column `ft`.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame holding the column to inspect.
    ft : column label
        Column of `data` to inspect.

    Returns
    -------
    pandas.Index
        Labels of the rows of `data` flagged as outliers (possibly empty).
    """
    values = data[ft]
    q1, q3 = values.quantile(0.25), values.quantile(0.75)
    whisker = 1.5 * (q3 - q1)

    too_low = values < q1 - whisker
    too_high = values > q3 + whisker

    return data.index[too_low | too_high]

In [None]:
# Gather the outlier row labels of every column into one flat list
# (a row flagged in several columns appears once per column).
ind = [row for column in train.columns for row in Outliers(train, column)]
print("Total Number of Outliers are: {}".format(len(ind)))

In [None]:
# Making copy of training data to see how our data is affected by dropping outliers.

train1 = train.copy()

In [None]:
print("Shape of training data before dropping outliers is {}".format(train.shape))
# Derive the outlier-free frame from `train` instead of dropping in place.
# An in-place drop is not idempotent: re-running the cell would raise KeyError
# because the labels in `ind` are already gone from `train1`.
train1 = train.drop(index = ind)
print("Shape of cpoy of training data after dropping outliers is {}".format(train1.shape))

In [None]:
# Class frequencies of the target once the flagged outlier rows are removed.
print(train1['open_channels'].value_counts())
fig, ax = plt.subplots(figsize = (8, 5))
sns.countplot(x = train1['open_channels'], ax = ax)
ax.set_xlabel('Target', size = 12)
ax.set_ylabel('Count', size = 12)
ax.set_title('Distribution in target column after dropping outliers', size = 12)

*Here we can see that dropping the outliers loses important information, so we will not drop them.*

In [None]:
# Feature matrix: every column of `train` except the last one.
x = train.iloc[:, :-1]
col = x.columns

# Standardised copy of the features, wrapped back into a DataFrame so the
# column names are kept.
# NOTE(review): the scaler is fitted on all rows before the train/test split
# done later in the notebook, so held-out rows influence the scaling
# statistics — confirm this is acceptable for the comparison.
std = StandardScaler()
x_std = std.fit_transform(x)
x_std = pd.DataFrame(data = x_std, columns = col)

# Min-max scaled copy of the features (also fitted on all rows; see note above).
mms = MinMaxScaler()
x_mms = mms.fit_transform(x)
x_mms = pd.DataFrame(data = x_mms, columns = col)

In [None]:
y = train.iloc[:, -1]
y.head()

# Building Models.

In [None]:
def Models(d, model, name, xtrain, xtest, ytrain, ytest):
    """Fit `model`, score it on both splits and append the scores to `d`.

    Parameters
    ----------
    d : dict
        Results accumulator with list values under the keys 'Name',
        'Train_acu', 'Test_acu' and 'F1_score'. It is updated in place.
    model : estimator
        Object exposing `fit(X, y)` and `predict(X)`; it is fitted in place.
    name : str
        Label stored under 'Name' and printed as progress output.
    xtrain, ytrain
        Features and labels used for fitting and for the training accuracy.
    xtest, ytest
        Held-out features and labels used for the test accuracy and macro F1.

    Returns
    -------
    dict
        The same `d` object, with one more entry in each of its lists.
    """
    print("Working with {}".format(name))
    model.fit(xtrain, ytrain)

    # scikit-learn metrics take the ground truth first: (y_true, y_pred).
    train_acu = accuracy_score(ytrain, model.predict(xtrain))

    test_pred = model.predict(xtest)
    test_acu = accuracy_score(ytest, test_pred)
    F1_score = f1_score(ytest, test_pred, average = 'macro')

    d['Name'].append(name)
    d['Train_acu'].append(train_acu)
    d['Test_acu'].append(test_acu)
    d['F1_score'].append(F1_score)
    print("********"*7)
    return d

In [None]:
l = [x, x_std, x_mms]
name = ['Normal', 'Standard Scaler', 'Min Max Scaler']
final = []
# One pass per feature representation. The split seed is fixed, so every
# representation is evaluated on the same train/test partition.
for features in l:
    xtrain, xtest, ytrain, ytest = train_test_split(features, y, test_size = 0.25, random_state = 42)

    d = {'Name':[], 'Train_acu':[], 'Test_acu':[], 'F1_score':[]}
    # Fresh estimators on every pass so no fitted state carries over.
    # The randomised tree models are seeded so the score tables are reproducible.
    models = [ [RandomForestClassifier(n_estimators = 5, random_state = 42), 'Random Forest'],
               [DecisionTreeClassifier(random_state = 42), 'Decision Tree'],
               [XGBClassifier(tree_method = 'gpu_hist'), 'XGBoost'],
               [CatBoostClassifier(task_type = 'GPU'), 'CatBoost'],
               [LogisticRegression(), 'Logistic Regression'] ]
    # Distinct names for the inner loop: reusing the outer loop variable here
    # would overwrite it on every iteration.
    for estimator, label in models:
        d = Models(d, estimator, label, xtrain, xtest, ytrain, ytest)

    final.append(d)

In [None]:
# Pair each representation label with its score table; iterating over the two
# lists together avoids a hard-coded count that could drift from their length.
for label, scores in zip(name, final):
    print(label)
    acu = pd.DataFrame(data = scores)
    print(acu)
    print("******************"*3)

*Choosing the random forest, since it gives the best results.*

In [None]:
xtrain, xtest, ytrain, ytest = train_test_split(x, y, test_size = 0.25, random_state = 42)

In [None]:
# Fixed seed so the fitted forest, and therefore the predictions made with it,
# are the same on every run of the notebook.
cla = RandomForestClassifier(n_estimators = 5, random_state = 42)
cla.fit(xtrain, ytrain)

# Making predictions on test data.

In [None]:
x_test = test.loc[:,:]
x_test.head()

In [None]:
predicted = cla.predict(x_test)
predicted

In [None]:
sample.head()

In [None]:
# Check whether the time value 500.0010 occurs in the test set.
# `in` on a Series tests the index labels, not the values, and a string never
# equals a float, so a plain membership test cannot answer this question.
# Compare the values numerically instead, with a tolerance well below the
# 4-decimal precision the times are written with in the submission.
np.isclose(test['time'], 500.0010, rtol = 0, atol = 1e-5).any()

In [None]:
submit = pd.DataFrame(data = {'time':test['time'], 'open_channels':predicted})
submit.head()

In [None]:
# Write the submission with an explicit .csv extension so the output is
# recognisable as a CSV file; float columns are written with 4 decimal places.
submit.to_csv('submission.csv', index = False, float_format='%.4f')