In [None]:
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Read the glass CSV from the relative input directory into a DataFrame.
glass = pd.read_csv(filepath_or_buffer="../input/glass/glass.csv")

In [None]:
# Preview the first five rows to check that the file loaded as expected.
glass.head(n=5)

In [None]:
# Number of missing values in each column.
glass.isna().sum()

In [None]:
# Bar chart of how many samples fall into each glass type (most frequent first).
type_counts = glass["Type"].value_counts()
bar_colors = ['Red', 'Pink', 'LightBlue', 'Purple', 'Brown', 'Green']
type_counts.plot(kind='bar', color=bar_colors)
plt.title("Glass types & counts")

# There are 6 types of glass; type 2 is the most common.

In [None]:

# RI at a glance: per-type swarm plot on the left, overall distribution on the right.
# plt.subplots(1, 2) builds the intended one-row, two-column grid. The previous
# plt.subplots(12) requested twelve stacked rows, which the later
# plt.subplot(121) / plt.subplot(122) calls then had to draw on top of.
fig, (ax_by_type, ax_dist) = plt.subplots(1, 2, figsize=(12, 5))

sns.swarmplot(x='Type', y='RI', data=glass, ax=ax_by_type)
ax_by_type.set_title("Type vs RI")

# NOTE: sns.distplot is deprecated in recent seaborn releases; it is kept here so
# the cell keeps working with whichever seaborn version the notebook targets.
sns.distplot(glass['RI'], ax=ax_dist)
ax_dist.set_title("RI Distribution")

In [None]:

# Sodium (Na) at a glance: per-type swarm plot on the left, overall distribution
# on the right.
# plt.subplots(1, 2) builds the intended one-row, two-column grid. The previous
# plt.subplots(12) requested twelve stacked rows, which the later
# plt.subplot(121) / plt.subplot(122) calls then had to draw on top of.
fig, (ax_by_type, ax_dist) = plt.subplots(1, 2, figsize=(12, 5))

sns.swarmplot(x='Type', y='Na', data=glass, ax=ax_by_type)
ax_by_type.set_title("Type vs Sodium")

# NOTE: sns.distplot is deprecated in recent seaborn releases; it is kept here so
# the cell keeps working with whichever seaborn version the notebook targets.
sns.distplot(glass['Na'], ax=ax_dist)
ax_dist.set_title("Sodium Distribution")

In [None]:
# Feature matrix: every column except the 'Type' label.
features = glass.drop(columns='Type')

In [None]:
# One distribution plot per feature, laid out on a 3x3 grid.
plt.subplots(nrows=3, ncols=3, figsize=[15, 25])
for position, column in enumerate(features, start=1):
    # Select the grid cell for this feature, then draw into it explicitly.
    panel = plt.subplot(3, 3, position)
    sns.distplot(glass[column], ax=panel)
    panel.set_title("Distribution of " + column)

In [None]:
# One box plot per feature, split by glass type, laid out on a 3x3 grid.
plt.subplots(nrows=3, ncols=3, figsize=[15, 25])
for position, column in enumerate(features, start=1):
    # Select the grid cell for this feature, then draw into it explicitly.
    panel = plt.subplot(3, 3, position)
    sns.boxplot(x=glass['Type'], y=glass[column], ax=panel)
    panel.set_title("Distribution of " + column)

In [None]:
# Pairwise correlation between all columns of `glass`, shown as an annotated heatmap.
corr = glass.corr()
sns.heatmap(data=corr, annot=True, cmap='viridis')

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score,classification_report,confusion_matrix
from sklearn.model_selection import train_test_split

In [None]:
# Feature matrix and target labels used by every classifier below.
X = features
y = glass['Type']

# Pin the seed so the split (and every score computed from it) is reproducible,
# and stratify on the label so each glass type keeps its share in both partitions.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, stratify=y
)



In [None]:
# Baseline: logistic regression with default settings.
lr = LogisticRegression()
lr.fit(X_train, y_train)
pred = lr.predict(X_test)

# scikit-learn metrics take (y_true, y_pred). Passing them the other way round
# swaps precision with recall in the report and transposes the confusion matrix.
print("Accuracy Score : ", accuracy_score(y_test, pred))
print(classification_report(y_test, pred))
# Rows are true types, columns are predicted types.
sns.heatmap(confusion_matrix(y_test, pred), annot=True, cmap='viridis')

In [None]:
from sklearn.svm import SVC

# Support vector classifier with a linear kernel.
svc = SVC(kernel='linear')
svc.fit(X_train, y_train)
pred = svc.predict(X_test)

# scikit-learn metrics take (y_true, y_pred). Passing them the other way round
# swaps precision with recall in the report.
print("Accuracy Score : ", accuracy_score(y_test, pred))
print(classification_report(y_test, pred))

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Single decision tree with default settings.
dc = DecisionTreeClassifier()
dc.fit(X_train, y_train)
pred = dc.predict(X_test)

# scikit-learn metrics take (y_true, y_pred). Passing them the other way round
# swaps precision with recall in the report.
print("Accuracy Score : ", accuracy_score(y_test, pred))
print(classification_report(y_test, pred))

In [None]:
# Rank the decision tree's feature importances and plot them as horizontal bars.
imp = dc.feature_importances_
importance_table = pd.DataFrame(
    imp,
    index=features.columns,
    columns=['Feature Importance'],
)
importance_table.sort_values(by='Feature Importance', ascending=False).plot(kind='barh')


In [None]:
from sklearn.ensemble import RandomForestClassifier

# Random forest with default settings.
rf = RandomForestClassifier()
rf.fit(X_train, y_train)
pred = rf.predict(X_test)

# classification_report takes (y_true, y_pred). Passing them the other way round
# swaps precision with recall in the report.
print(classification_report(y_test, pred))

In [None]:
# Rank the random forest's feature importances and plot them as horizontal bars.
# This cell previously read dc.feature_importances_, which only re-drew the
# decision tree chart; `rf` is the forest fitted in the preceding cell.
imp = rf.feature_importances_
pd.DataFrame(
    imp,
    index=features.columns,
    columns=['Feature Importance'],
).sort_values(by='Feature Importance', ascending=False).plot(kind='barh')

In [None]:
from sklearn.ensemble import ExtraTreesClassifier

# Extremely randomized trees with default settings.
etc = ExtraTreesClassifier()
etc.fit(X_train, y_train)
pred = etc.predict(X_test)

# classification_report takes (y_true, y_pred). Passing them the other way round
# swaps precision with recall in the report.
print("Classification Score : ", classification_report(y_test, pred))

In [None]:
from sklearn.ensemble import GradientBoostingClassifier

# Gradient boosted trees with default settings.
gbc = GradientBoostingClassifier()
gbc.fit(X_train, y_train)
pred = gbc.predict(X_test)

# classification_report takes (y_true, y_pred). Passing them the other way round
# swaps precision with recall in the report.
print("Classification Score : ", classification_report(y_test, pred))

In [None]:
# Candidate values for the randomized random-forest hyper-parameter search.
estimators = range(50, 100)
# 'auto' was only an alias of 'sqrt' for classifiers and has been removed from
# current scikit-learn releases, where it makes the fit fail. Listing 'sqrt'
# alone covers the same search space on every version.
max_features = ['sqrt']
max_depth = range(4, 15)
min_samples_split = range(2, 8)
min_samples_leaf = range(1, 8)
bootstrap = [True, False]
criterion = ['gini', 'entropy']

# Keys are RandomForestClassifier constructor argument names, as required by
# RandomizedSearchCV's param_distributions.
random_grid = {'n_estimators': estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap,
               'criterion': criterion}

In [None]:
from sklearn.model_selection import RandomizedSearchCV

# Randomized hyper-parameter search over random_grid with 5-fold cross-validation.
# Both the forest and the parameter sampler are seeded so the selected
# parameters are repeatable from run to run.
rf = RandomForestClassifier(random_state=42)
# From here on `rf` refers to the search object, not the bare forest.
rf = RandomizedSearchCV(
    estimator=rf,
    param_distributions=random_grid,
    cv=5,
    verbose=1,
    random_state=42,
)
rf.fit(X_train, y_train)

In [None]:
# Hyper-parameter combination that scored best in the randomized search;
# the bare name on the last line displays it as the cell output.
best_estimate=rf.best_params_
best_estimate

In [None]:
# Build a forest from the best parameters found by the search and train it on
# the training split. The seed is pinned so the reported scores are repeatable;
# set_params applies the searched values on top of it.
rf2 = RandomForestClassifier(random_state=42).set_params(**best_estimate)
rf2.fit(X_train, y_train)

In [None]:
# Evaluate the tuned forest on the held-out test split.
y_pred = rf2.predict(X_test)

# scikit-learn metrics take (y_true, y_pred). Passing them the other way round
# swaps precision with recall in the report and transposes the confusion matrix.
print(classification_report(y_test, y_pred))
# Rows are true types, columns are predicted types.
sns.heatmap(confusion_matrix(y_test, y_pred), annot=True)

# The tuned random forest classifier reached about 73% accuracy on the test set.