In [None]:
# # This Python 3 environment comes with many helpful analytics libraries installed
# # It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# # For example, here's several helpful packages to load

# import numpy as np # linear algebra
# import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# # Input data files are available in the read-only "../input/" directory
# # For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

# import os
# for dirname, _, filenames in os.walk('/kaggle/input'):
#     for filename in filenames:
#         print(os.path.join(dirname, filename))

# # You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# # You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Shell escape: list the contents of the current working directory.
!ls

In [None]:
# NOTE: throughout this notebook `pd` is cuDF and `np` is CuPy (GPU libraries),
# not pandas / NumPy, even though the aliases are the conventional pandas/NumPy ones.
import cudf as pd
import cupy as np
import matplotlib.pyplot as plt
from cuml.model_selection import train_test_split
from tqdm import tqdm
from keras.preprocessing import image

# Render matplotlib figures inline in the notebook output.
%matplotlib inline

In [None]:
# Read the training image index; the "classname" and "img" columns are used
# further down to build each image path.
DRIVER_LIST_CSV = '../input/state-farm-distracted-driver-detection/driver_imgs_list.csv'
train = pd.read_csv(DRIVER_LIST_CSV)
train.head()

# Image dataset loading
### For the classical ML models, each image is resized to 64x64 with 3 channels (R, G, B) and flattened into a 1D array, which is the input format these models expect.
### The Keras image API is used to load the dataset.

In [None]:
# Load every training image as a flattened feature vector scaled to [0, 1].
TRAIN_IMG_DIR = '../input/state-farm-distracted-driver-detection/imgs/train/'

# Copy the two string columns to host lists once: indexing a GPU-backed cudf
# Series one element at a time inside the loop is slow.
train_classnames = train["classname"].to_pandas().tolist()
train_filenames = train["img"].to_pandas().tolist()

train_image = []
for classname, filename in tqdm(zip(train_classnames, train_filenames), total=len(train_filenames)):
    # load_img documents target_size as (height, width); the channel count is
    # governed by color_mode, so it does not belong in this tuple.
    img = image.load_img(TRAIN_IMG_DIR + classname + "/" + filename, target_size=(64, 64))
    img = image.img_to_array(img).flatten()
    img = img/255
    train_image.append(img)

# One row per image, one column per flattened pixel value (np is CuPy here).
X = np.array(train_image)

## Encoding Classnames

In [None]:
# Integer-encode the class names: `y` receives the codes and `definitions`
# the unique labels those codes refer to.
factor = pd.factorize(train['classname'])
y, definitions = factor
print(y)
print(definitions)

## Checking for class Imbalance in dataset


In [None]:
# Count the images per class, print the table, then draw it as a bar chart.
class_counts = train['classname'].value_counts()
print(class_counts)
pd.DataFrame(class_counts).to_pandas().plot(kind='bar')

# Image Quality Assessment using the BRISQUE Score

In [None]:
from libsvm import svmutil
!pip install pybrisque
from brisque import *

In [None]:
# Create the BRISQUE scorer used below to score every training image.
brisq = BRISQUE()

In [None]:
# Compute a BRISQUE score for every training image.
# (tqdm is already imported in the notebook's import cell.)
BRISQUE_IMG_DIR = '../input/state-farm-distracted-driver-detection/imgs/train/'

# Copy the needed columns to host lists once: indexing a GPU-backed cudf Series
# one element at a time inside the loop is slow.
brisque_classnames = train["classname"].to_pandas().tolist()
brisque_filenames = train["img"].to_pandas().tolist()

scores=[]  # one score per image, in the row order of `train`
l=[]       # (image filename, score) pairs
for classname, filename in tqdm(zip(brisque_classnames, brisque_filenames), total=len(brisque_filenames)):
    temp=brisq.get_score(BRISQUE_IMG_DIR + classname + "/" + filename)
    l.append((filename,temp))
    scores.append(temp)

In [None]:
# Arithmetic mean of the per-image BRISQUE scores collected in `scores`.
import statistics
statistics.mean(scores)

##### Since the mean BRISQUE score is low, the dataset images are of high quality (a lower BRISQUE score indicates better perceptual quality).

In [None]:
# Histogram of the per-image BRISQUE scores.
import matplotlib.pyplot as plt
plt.hist(scores)

In [None]:
# Sanity check: dimensions of the feature matrix built from the training images.
X.shape

# Train-Test Split 

In [None]:
# Hold out 10% of the samples for evaluation; the seed is fixed for repeatability.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.1,
    random_state=42,
)

In [None]:
from cuml.naive_bayes import GaussianNB
from cuml.linear_model import LogisticRegression
from cuml.svm import SVC

# Logistic Regression

In [None]:
import cuml

# Fit logistic regression on the training split and report hold-out accuracy.
clf_lr = LogisticRegression()
clf_lr.fit(X_train, y_train)

preds = clf_lr.predict(X_test)
cu_score = cuml.metrics.accuracy_score(y_test, preds)
print(cu_score)

# Gaussian Naive Bayes

In [None]:
import cuml

# Fit Gaussian naive Bayes on the training split and report hold-out accuracy.
clf_gnb = GaussianNB()
clf_gnb.fit(X_train, y_train)

preds = clf_gnb.predict(X_test)
cu_score = cuml.metrics.accuracy_score(y_test, preds)
print(cu_score)

# Support Vector Classifier

In [None]:
# probability=True is requested so that predict_proba can be called on the fitted model below.
clf_svc = SVC(probability=True)
clf_svc.fit(X_train, y_train)

# Predicting Probability of Each Class

In [None]:
# Per-class probability estimates from the SVC for every hold-out sample.
preds_prob= clf_svc.predict_proba(X_test)

In [None]:
# Inspect the probability estimates for the first hold-out sample.
preds_prob[0]

# Predicting the best class for the accuracy metric

In [None]:
import cuml

# Hard class predictions from the SVC, scored against the hold-out labels.
preds = clf_svc.predict(X_test)
cu_score = cuml.metrics.accuracy_score(y_test, preds)
print(cu_score)

In [None]:
# Display the SVC's predicted class codes for the hold-out split.
preds

## Calculating the Confusion Matrix

In [None]:
from cuml.metrics import confusion_matrix

# Both label arrays are cast to int32 before being handed to cuML's confusion_matrix.
y_test_i32 = y_test.astype("int32")
preds_i32 = preds.astype("int32")
cm = confusion_matrix(y_test_i32, preds_i32)

In [None]:
import seaborn as sns
import cupy as np

# seaborn plots host data, so the confusion matrix is copied off the GPU first.
sns.set(font_scale=1.0)
cm_host = np.asnumpy(cm)
sns.heatmap(cm_host, annot=True, cmap='Blues', fmt='g')

## XGBoost, CatBoost, LightGBM, Random Forest & their Ensemble

In [None]:
import cuml
import xgboost as xgb

# GPU-trained XGBoost classifier: keep the class probabilities for later use and
# report hold-out accuracy.
xgb_clf = xgb.XGBClassifier(
    use_label_encoder=False,
    tree_method='gpu_hist',
)
xgb_clf.fit(X_train, y_train)

preds_prob_xgb = xgb_clf.predict_proba(X_test)
preds = xgb_clf.predict(X_test)
cu_score = cuml.metrics.accuracy_score(y_test, preds)
print(cu_score)

In [None]:
from catboost import CatBoostClassifier

# CatBoost is given host copies of the splits (np.asnumpy) rather than device arrays.
cgb_params = dict(
    iterations=500,
    learning_rate=0.01,
    task_type="GPU",
    metric_period=100,
    random_seed=42,
)
cgb_clf = CatBoostClassifier(**cgb_params)
cgb_clf.fit(np.asnumpy(X_train), np.asnumpy(y_train))

X_test_cgb_host = np.asnumpy(X_test)
preds_prob_cgb = cgb_clf.predict_proba(X_test_cgb_host)
preds = cgb_clf.predict(X_test_cgb_host)
cu_score = cuml.metrics.accuracy_score(y_test, preds)
print(cu_score)

In [None]:
import lightgbm as lgb

# The objective is deliberately left unset: LGBMClassifier then chooses 'binary' or
# 'multiclass' from the labels it is fitted on. The previous hard-coded
# objective='binary' did not describe this task, which predicts one of several
# class codes (see the per-class probabilities and confusion matrix above).
lgb_clf = lgb.LGBMClassifier(boosting_type='dart',learning_rate=0.18, max_depth=7,
               n_estimators=450,device='gpu',
               random_state=42)

# LightGBM is given host copies of the splits (np.asnumpy) rather than device arrays.
lgb_clf.fit(np.asnumpy(X_train),np.asnumpy(y_train))

X_test_lgb_host = np.asnumpy(X_test)
preds_prob_lgb=lgb_clf.predict_proba(X_test_lgb_host)
preds= lgb_clf.predict(X_test_lgb_host)
cu_score = cuml.metrics.accuracy_score( y_test, preds )
print(cu_score)

In [None]:
from cuml.ensemble import RandomForestClassifier

# cuML random forest: keep the class probabilities and report hold-out accuracy.
rdf_clf = RandomForestClassifier(
    n_estimators=600,
    random_state=42,
    verbose=0,
    warm_start=False,
)
rdf_clf.fit(X_train, y_train)

preds_prob_rdf = rdf_clf.predict_proba(X_test)
preds = rdf_clf.predict(X_test)
cu_score = cuml.metrics.accuracy_score(y_test, preds)
print(cu_score)

# Ensemble
#### *Note: ensembling ran into memory allocation problems in this environment; it can be implemented as shown below on a GPU with more memory.

In [None]:
from sklearn.ensemble import  VotingClassifier

# Soft-voting ensemble over the four tree-based models, with per-model weights.
# Fix: the XGBoost estimator is named `xgb_clf`; the previous `xgb_cl` was an
# undefined name and raised NameError.
eclf1 = VotingClassifier(
    estimators=[
        ('catboost', cgb_clf),
        ('xgboost', xgb_clf),
        ('lightgbm', lgb_clf),
        ('randomforest', rdf_clf),
    ],
    voting='soft',
    weights=[3,2,3,3],
    flatten_transform=True,
)
# The ensemble is fitted on host copies of the training split.
eclf1 = eclf1.fit(np.asnumpy(X_train),np.asnumpy(y_train))

In [None]:
# Hard class predictions from the voting ensemble, scored against the hold-out labels.
X_test_ens_host = np.asnumpy(X_test)
preds = eclf1.predict(X_test_ens_host)
cu_score = cuml.metrics.accuracy_score(y_test, preds)
print(cu_score)

In [None]:
# Inspect the ensemble's class probabilities for the first hold-out sample.
eclf1.predict_proba(np.asnumpy(X_test))[0]

In [None]:
# Read the sample submission; its "img" column is used below to locate the test images.
SAMPLE_SUBMISSION_CSV = '../input/state-farm-distracted-driver-detection/sample_submission.csv'
test = pd.read_csv(SAMPLE_SUBMISSION_CSV)
test.head()


In [None]:
# Load every test image as a flattened feature vector scaled to [0, 1], using the
# same preprocessing as the training images.
TEST_IMG_DIR = '../input/state-farm-distracted-driver-detection/imgs/test/'

# Copy the filename column to a host list once: indexing a GPU-backed cudf Series
# one element at a time inside the loop is slow.
test_filenames = test["img"].to_pandas().tolist()

test_image = []
for filename in tqdm(test_filenames):
    # load_img documents target_size as (height, width); the channel count is
    # governed by color_mode, so it does not belong in this tuple.
    img = image.load_img(TEST_IMG_DIR + filename, target_size=(64, 64))
    img = image.img_to_array(img).flatten()
    img = img/255
    test_image.append(img)

# One row per test image, one column per flattened pixel value (np is CuPy here).
test_data = np.array(test_image)

In [None]:
# Class probabilities from the ensemble for every test image.
# NOTE(review): this rebinds `preds`, which held hard class labels in earlier cells,
# to a probability matrix.
preds=eclf1.predict_proba(np.asnumpy(test_data))