In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _, names in os.walk('/kaggle/input'):
    for path in (os.path.join(root, name) for name in names):
        print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
#imports
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime as dt
# Use the fully qualified option name: the bare 'max_rows' pattern is ambiguous on
# pandas versions that also define 'styler.render.max_rows' and raises OptionError.
pd.set_option('display.max_rows', 1000)
# Apply seaborn's theme and make 15x10 inches the default figure size for later plots.
sns.set(rc={'figure.figsize':(15,10)})
from scipy.special import boxcox1p
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder,StandardScaler
from sklearn.model_selection import cross_val_score,cross_val_predict
from xgboost import XGBClassifier
from sklearn.metrics import confusion_matrix,precision_score, recall_score,f1_score,roc_auc_score,roc_curve,accuracy_score

In [None]:
# Read the glass dataset from the Kaggle input directory and preview its first rows.
GLASS_CSV = '/kaggle/input/glass/glass.csv'
df = pd.read_csv(GLASS_CSV)
df.head()

**Feature information for the dataset**


RI: refractive index

Na: Sodium (unit of measurement: weight percent in the corresponding oxide; the same unit applies to every element listed below)

Mg: Magnesium

Al: Aluminum

Si: Silicon

K: Potassium

Ca: Calcium

Ba: Barium

Fe: Iron

Type of glass: (class attribute)

-- 1 building windows, float processed

-- 2 building windows, non-float processed

-- 3 vehicle windows, float processed

-- 4 vehicle windows, non-float processed (none in this database)

-- 5 containers

-- 6 tableware

-- 7 headlamps


In [None]:
# Print a concise summary of the frame: column names, non-null counts and dtypes.
df.info()

In [None]:
# Work on a copy so the frame loaded from disk (`df`) stays untouched.
df1 = df.copy()

# EXPLORATORY DATA ANALYSIS

In [None]:
# Annotated heatmap of the pairwise correlations between all columns.
corr_matrix = df1.corr()
plt.figure()
sns.heatmap(corr_matrix, annot=True)
plt.show()

In [None]:
# One histogram per column. DataFrame.hist builds its own figure sized by
# `figsize`, so a preceding plt.figure() call would only leave an empty
# extra figure behind.
df1.hist(figsize=(20,20))
plt.show()

In [None]:
# Bar chart of the number of samples in each glass type.
type_counts = df1['Type'].value_counts()
plt.figure()
type_counts.plot.bar()
plt.show()

In [None]:
# Mean refractive index (RI) for each glass type.
g1 = df1.groupby('Type')['RI'].mean()
# Pass x/y by keyword: recent seaborn releases reject positional data vectors.
axis = sns.barplot(x=g1.index, y=g1, palette='rocket')
axis.set_title('RI vs Type')
axis.set_ylabel('RI')
axis.set_xlabel('Type')
plt.xticks(rotation = 90)
# Render the figure explicitly so the cell does not echo the xticks return value.
plt.show()

No clear correlation between RI and glass type can be found.

In [None]:
# Mean sodium (Na) content for each glass type.
g2 = df1.groupby('Type')['Na'].mean()
# Pass x/y by keyword: recent seaborn releases reject positional data vectors.
axis = sns.barplot(x=g2.index, y=g2, palette='magma_r')
axis.set_title('Na vs Type')
axis.set_ylabel('Na')
axis.set_xlabel('Type')
plt.xticks(rotation = 90)
plt.show()

In [None]:
# Mean magnesium (Mg) content for each glass type.
g3 = df1.groupby('Type')['Mg'].mean()
# Pass x/y by keyword: recent seaborn releases reject positional data vectors.
axis = sns.barplot(x=g3.index, y=g3, palette='winter')
axis.set_title('Mg vs Type')
axis.set_ylabel('Mg')
axis.set_xlabel('Type')
plt.xticks(rotation = 90)
plt.show()

In [None]:
# Mean aluminum (Al) content for each glass type.
g4 = df1.groupby('Type')['Al'].mean()
# Pass x/y by keyword: recent seaborn releases reject positional data vectors.
axis = sns.barplot(x=g4.index, y=g4, palette='summer_r')
axis.set_title('Al vs Type')
axis.set_ylabel('Al')
axis.set_xlabel('Type')
plt.xticks(rotation = 90)
plt.show()

In [None]:
# Mean barium (Ba) content for each glass type.
g5 = df1.groupby('Type')['Ba'].mean()
# Pass x/y by keyword: recent seaborn releases reject positional data vectors.
axis = sns.barplot(x=g5.index, y=g5, palette='autumn_r')
axis.set_title('Ba vs Type')
axis.set_ylabel('Ba')
axis.set_xlabel('Type')
plt.xticks(rotation = 90)
plt.show()

In [None]:
# Mean iron (Fe) content for each glass type.
g6 = df1.groupby('Type')['Fe'].mean()
# Pass x/y by keyword: recent seaborn releases reject positional data vectors.
axis = sns.barplot(x=g6.index, y=g6, palette='magma')
axis.set_title('Fe vs Type')
axis.set_ylabel('Fe')
axis.set_xlabel('Type')
plt.xticks(rotation = 90)
plt.show()

# ML MODELLING

In [None]:
# Separate the feature columns from the target column ('Type').
X = df1.drop(columns='Type')
y = df1['Type']
print(X.shape, y.shape)

In [None]:
# Hold out 15% of the shuffled samples for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.15,
    shuffle=True,
    random_state=42,
)
print(*(part.shape for part in (X_train, X_test, y_train, y_test)))

In [None]:
# Fit the scaler on the training features only, then apply that same
# transformation to both splits.
scaler = StandardScaler().fit(X_train)
X_train, X_test = scaler.transform(X_train), scaler.transform(X_test)

In [None]:
#models
# Seed the forest so its bootstrap sampling and feature selection, and therefore
# the scores computed from it, are reproducible across runs.
rf = RandomForestClassifier(n_estimators=1000,n_jobs=-1,max_leaf_nodes=15,random_state=42)
xgb = XGBClassifier(n_jobs=-1)

In [None]:
# Accuracy of the random forest on each of 5 cross-validation folds of the training split.
rf_cv_accuracy = cross_val_score(rf, X_train, y_train, scoring="accuracy", cv=5)
rf_cv_accuracy

In [None]:
def metrics(model, X_train, y_train, cv=5):
    """Print cross-validated classification metrics for ``model``.

    Predictions come from ``cross_val_predict``, so each training sample is
    predicted by a model fitted on the other folds.

    Parameters
    ----------
    model : estimator
        Classifier handed to ``cross_val_predict``.
    X_train : array-like
        Training features.
    y_train : array-like
        Training labels.
    cv : int or cross-validation splitter, default 5
        Forwarded unchanged to ``cross_val_predict``.

    Returns
    -------
    None
        The confusion matrix and the weighted precision, recall and F1
        scores are printed, each preceded by a separator line.
    """
    y_train_pred = cross_val_predict(model, X_train, y_train, cv=cv)
    separator = '###################'

    print(separator)
    print('Confusion matrix:')
    print(confusion_matrix(y_train, y_train_pred))

    # The three scores share the same call shape, so report them in one loop.
    weighted_scorers = (
        ('precision score:', precision_score),
        ('recall score:', recall_score),
        ('f1 score:', f1_score),
    )
    for title, scorer in weighted_scorers:
        print(separator)
        print(title)
        print(scorer(y_train, y_train_pred, average='weighted'))


metrics(rf,X_train,y_train)

In [None]:
# Fit the random forest on the full training split, then report train and test accuracy.
rf.fit(X_train, y_train)
# accuracy_score is declared as (y_true, y_pred); pass the arguments in that order.
print(accuracy_score(y_train, rf.predict(X_train)))
print(accuracy_score(y_test, rf.predict(X_test)))
# X is still the unscaled DataFrame, so its columns supply the feature names
# in the same order as the importances.
for name, score in zip(X.columns, rf.feature_importances_):
    print(name, score)

In [None]:
# Recent XGBoost releases require class labels to be consecutive integers starting
# at 0, but the glass types are 1-7 with gaps; encode them before cross-validating.
cross_val_score(xgb, X_train, LabelEncoder().fit_transform(y_train), cv=5, scoring="accuracy")

In [None]:
# Recent XGBoost releases require class labels to be consecutive integers starting
# at 0, but the glass types are 1-7 with gaps. LabelEncoder keeps the sorted class
# order, so the confusion matrix and the scores are unaffected by the encoding.
metrics(xgb, X_train, LabelEncoder().fit_transform(y_train))

In [None]:
# Recent XGBoost releases require class labels to be consecutive integers starting
# at 0, but the glass types are 1-7 with gaps; train on encoded labels.
label_encoder = LabelEncoder()
y_train_encoded = label_encoder.fit_transform(y_train)
xgb.fit(X_train, y_train_encoded)
# Map predictions back to the original glass type ids so they are directly
# comparable with y_test.
pred = label_encoder.inverse_transform(xgb.predict(X_test))
# accuracy_score is declared as (y_true, y_pred).
print(accuracy_score(y_train_encoded, xgb.predict(X_train)))
print(accuracy_score(y_test, pred))

In [None]:
# Put the true test labels next to the model's predictions for side-by-side inspection.
output = pd.DataFrame({'Actual': y_test}).assign(Predicted=pred)
output.head()

In [None]:
# Save the actual/predicted table to the working directory, without the row index.
output.to_csv('Predictions.csv',index=False)