In [None]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import r2_score, accuracy_score, mean_absolute_error, mean_squared_error, roc_curve, auc, confusion_matrix, classification_report

from imblearn.over_sampling import RandomOverSampler

In [None]:
# List the contents of the ../input/ozone-level-detection directory (shell escape).
!ls ../input/ozone-level-detection

In [None]:
# Load the raw CSV. The file has no header row, so pandas assigns integer column labels.
# Missing readings are written as "?" in this file; declaring that marker at load time
# lets pandas parse the measurement columns as floats instead of strings.
data = pd.read_csv(
    "../input/ozone-level-detection/eighthr.data.csv",
    header=None,
    na_values="?",
)
data

In [None]:
# Inspect column dtypes and non-null counts of the freshly loaded frame.
data.info()

In [None]:
# (rows, columns) of the raw frame, before any column is dropped.
data.shape

In [None]:
# Discard the column labelled 0 (it is not used as a feature) and show what remains.
data = data.drop(columns=[0])
data

In [None]:
# Summary statistics for the columns pandas currently treats as numeric.
data.describe()

In [None]:
# Turn every "?" placeholder into NaN across the whole frame in a single pass.
data = data.replace("?", np.nan)

In [None]:
# Count the NaN entries in each column.
data.isna().sum()

In [None]:
# Show the frame again now that the "?" placeholders have been replaced with NaN.
data

In [None]:
# Cast every column except the last one to float.
float_dtypes = {col: float for col in data.columns[:-1]}
data = data.astype(float_dtypes)

In [None]:
# Forward-fill gaps: each NaN takes the most recent earlier value in its column.
# Columns without missing values come out unchanged.
data = data.ffill()

In [None]:
# Re-count the NaNs per column after the forward fill.
data.isnull().sum()

In [None]:
# Re-check dtypes and non-null counts after the float cast and forward fill.
data.info()

In [None]:
# Peek at the first rows of the cleaned frame.
data.head()

In [None]:
# Summary statistics of the cleaned frame.
data.describe()

In [None]:
# Class balance: how many rows carry each distinct value of column 73.
data[73].value_counts()

In [None]:
# Column 73 is the prediction target; every other column is a feature.
y = data[73]
x = data.drop(columns=[73])

In [None]:
# Balance the classes by randomly re-drawing rows of the under-represented class.
# A fixed seed makes the resampled set, and every metric computed from it, reproducible;
# 42 matches the random_state used for the train/test split.
# NOTE(review): resampling happens before the train/test split, so copies of the same
# row can land on both sides of the split and inflate the test scores. Consider
# splitting first and oversampling only the training part.
oversample = RandomOverSampler(random_state=42)
xnew, ynew = oversample.fit_resample(x, y)

In [None]:
# Check the class counts after oversampling.
ynew.value_counts()

In [None]:
# Reassemble the resampled features and target into one frame, with the target in column 73.
new_data = xnew.copy()
new_data[73] = ynew
new_data

In [None]:
# Pairwise correlations of all columns, drawn as an annotated heatmap on a large canvas.
corr = new_data.corr()
fig, ax = plt.subplots(figsize=(30, 30))
sns.heatmap(corr, annot=True, ax=ax)

In [None]:
# Correlation of every column with the target (column 73), largest value first.
corr[73].sort_values(ascending=False)

In [None]:
def get_corelated_col(cor_dat, threshold):
    """Return the entries of a correlation Series that exceed ``threshold``.

    Parameters
    ----------
    cor_dat : pandas.Series
        Correlation coefficients indexed by column label (e.g. ``corr[73]``).
    threshold : float
        Exclusive lower bound; only values strictly greater than it are kept,
        so negative correlations are never selected for a threshold >= 0.

    Returns
    -------
    pandas.DataFrame
        One column named ``'corr value'``, indexed by the selected labels in
        their original order.
    """
    # A boolean mask replaces the per-label lookup: NaN compares False and is
    # dropped, and duplicate index labels no longer raise.
    selected = cor_dat[cor_dat > threshold]
    return selected.to_frame(name='corr value')
            
# Keep the columns whose correlation with the target (column 73) is strictly positive.
# Column 73 correlates perfectly with itself, so it stays in the selection; the
# later `final.drop(73, axis=1)` relies on that.
top_corelated_values = get_corelated_col(corr[73], 0)
top_corelated_values

In [None]:
# Restrict the resampled frame to the positively correlated columns and report its shape.
selected_columns = top_corelated_values.index
final = new_data.loc[:, selected_columns]
final.shape

In [None]:
# Target vector: column 73 of the selected frame.
y = final[73]

# Standardise the remaining columns; `x` ends up as the array returned by the scaler.
# NOTE(review): the scaler is fitted on all rows before the train/test split, so the
# test rows influence the scaling statistics. Consider fitting on the training part only.
scaler = StandardScaler()
x = scaler.fit_transform(final.drop(columns=73))

y

In [None]:
# Hold out 25% of the rows for evaluation; the fixed seed keeps the split repeatable.
xtrain, xtest, ytrain, ytest = train_test_split(
    x,
    y,
    random_state=42,
    test_size=0.25,
)

In [None]:
# k-nearest-neighbours classifier using the 6 closest training points.
reg = KNeighborsClassifier(n_neighbors=6)
reg.fit(X=xtrain, y=ytrain)

In [None]:
# Class predictions of the fitted KNN model for the held-out rows.
ypredict=reg.predict(xtest)

In [None]:
# Absolute and squared error between the true and predicted test labels.
knn_mae = mean_absolute_error(ytest, ypredict)
knn_mse = mean_squared_error(ytest, ypredict)
print(f"Mean Absolute Error is {knn_mae} ")
print(f"Mean Squared Error is {knn_mse} ")

In [None]:
# ROC curve for the KNN classifier.
# roc_curve needs a ranking score, not hard class labels: with 0/1 predictions the
# curve has a single operating point. Use the estimated probability of the positive
# class instead. Reading it from `reg` also keeps this cell independent of whatever
# `ypredict` currently holds.
positive_col = list(reg.classes_).index(1)
yscore = reg.predict_proba(xtest)[:, positive_col]

fpr, tpr, threshold = roc_curve(ytest, yscore, pos_label=1)

fig, ax = plt.subplots()
ax.plot(fpr, tpr)
ax.set_xlabel("False Positive Rate")
ax.set_ylabel("True Positive Rate")
ax.set_title("ROC curve")
plt.show()
print("AUC value is {} ".format(auc(fpr, tpr)))

In [None]:
# Share of test rows whose predicted label equals the true label, as a percentage.
knn_accuracy_pct = accuracy_score(ytest, ypredict) * 100
print(f"Accuracy score of the predictions: {knn_accuracy_pct}%")

In [None]:
# Confusion matrix: rows are the true classes, columns the predicted classes.
confu = confusion_matrix(ytest, ypredict, labels = [0,1])

# fmt="d" prints the integer counts in full; the default format switches to
# scientific notation for counts of 100 or more.
ax = sns.heatmap(confu, annot=True, fmt="d")
ax.set_xlabel("Predicted label")
ax.set_ylabel("True label")
plt.show()

In [None]:
# Per-class precision, recall and F1 for the test predictions, printed under a heading.
report_text = classification_report(ytest, ypredict)
print("Classification Report for our model is ", report_text, sep="\n")

# Applying Random Forest Regressor

In [None]:
# Random forest regressor with 500 trees, fitted on the same training split.
# The fixed seed makes the forest, and therefore the scores below, reproducible;
# 42 matches the random_state used for the train/test split.
rf = RandomForestRegressor(n_estimators=500, random_state=42)
rf.fit(xtrain, ytrain)

# From here on `ypredict` holds the forest's continuous outputs, replacing the
# KNN class predictions stored under the same name.
ypredict = rf.predict(xtest)
r2_score(ytest, ypredict)

In [None]:
# Error scores of the current `ypredict` against the true test labels.
rf_mae = mean_absolute_error(ytest, ypredict)
rf_mse = mean_squared_error(ytest, ypredict)
print(f"Mean Absolute Error is {rf_mae} ")
print(f"Mean Squared Error is {rf_mse} ")

In [None]:
# ROC curve and area under it, using the current `ypredict` values as ranking scores.
fpr, tpr, threshold = roc_curve(ytest, ypredict, pos_label=1)

fig, ax = plt.subplots()
ax.plot(fpr, tpr)
ax.set_xlabel("False Positive Rate")
ax.set_ylabel("True Positive Rate")
ax.set_title("ROC curve")
plt.show()

roc_auc = auc(fpr, tpr)
print(f"AUC value is {roc_auc} ")