In [None]:
# Importing necessary libraries

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt


In [None]:
# Read the raw CSV; header=None makes pandas number the columns 0..N-1
# instead of taking names from the first row.
csv_path = "../input/ozone-level-detection/eighthr.data.csv"
data = pd.read_csv(csv_path, header=None)
df = pd.DataFrame(data)

In [None]:
# (number of rows, number of columns) of the freshly loaded frame
df.shape

In [None]:
# Preview the first five rows of the raw data
df.head()

In [None]:
# Per-column dtype and non-null count, plus memory usage
df.info()

In [None]:
# Column 0 holds the date, which is not useful for prediction or analysis, so remove it
df = df.drop(columns=[0])

> Since the dataset contains the __'?'__ symbol in some places, we replace it with __NaN__.

In [None]:
# Replace every '?' placeholder with NaN in a single vectorised call
# instead of looping over the columns one at a time.
df = df.replace('?', np.nan)

In [None]:
# Preview the first five rows again after the '?' -> NaN replacement
df.head()

In [None]:
# Print the percentage of NaN values in each column and collect the labels
# of the columns that contain at least one NaN.
null_col = []
for col in df.columns:
  nan_pct = df[col].isna().mean() * 100
  print(col, "\t-\t", nan_pct)
  if nan_pct > 0:
    null_col.append(col)

> Since no column has a significant share of missing values, there is no need to drop any column here. Instead, fill the NaN values with the mean of each column.

In [None]:
# The values were read as strings, so convert every column except the last one to float

feature_columns = df.columns[:-1]
for column in feature_columns:
  as_text = df[column].astype(str)
  df[column] = as_text.astype(float)

In [None]:
# Fill the gaps in every column that had missing values with that column's mean
for col in null_col:
  col_mean = df[col].mean()
  df[col] = df[col].fillna(col_mean)

# Print the per-column NaN percentage again to check for remaining nulls
for col in df.columns:
  nan_pct = df[col].isna().mean() * 100
  print(col, "\t-\t", nan_pct)

In [None]:
# Check whether the target classes (column 73) are imbalanced

fig, ax = plt.subplots(figsize=(5, 5))
sns.countplot(x=73, data=df, ax=ax)

# Write each bar's height (the class count) just above the bar
for bar in ax.patches:
  count = bar.get_height()
  ax.annotate('{}'.format(count), (bar.get_x() + 0.1, count + 50))


In [None]:
from imblearn.over_sampling import RandomOverSampler

# Randomly duplicate minority-class rows so the classes become balanced.
# A fixed random_state makes the resampled rows (and every result derived
# from them) reproducible across kernel restarts.
# NOTE(review): resampling happens before the train/test split further down,
# so copies of the same row can land in both sets - consider splitting first.
oversample = RandomOverSampler(random_state=42)
features = df.drop([73], axis=1)
x, y = oversample.fit_resample(features, df[73])

# Rebuild one frame holding the resampled features plus the target column (73)
new_df = pd.DataFrame(x, columns=features.columns)
new_df[73] = y

new_df.head()



> After oversampling, all the classes are now balanced.

>Let's further see how the attributes are related to each other using a correlation heatmap

In [None]:
# Pairwise correlation between all columns of the oversampled data,
# drawn as an annotated heatmap on a large canvas.
fig, ax = plt.subplots(figsize=(50, 50))
cormap = new_df.corr()
sns.heatmap(data=cormap, annot=True, ax=ax)

In [None]:
# Simple Function to get the name of top most corelated attributes

def get_corelated_col(cor_dat, threshold):
  """Return the entries of a correlation Series whose magnitude exceeds a threshold.

  Parameters
  ----------
  cor_dat : pandas.Series
      Correlation values (e.g. one column of a correlation matrix), indexed
      by feature label.
  threshold : float
      Only entries with ``abs(value) > threshold`` are kept.

  Returns
  -------
  pandas.DataFrame
      One row per selected feature, indexed by the feature label, with a
      single column named ``'corr value'``. Empty when nothing qualifies,
      including when ``cor_dat`` itself is empty.
  """
  feature = []
  value = []
  for label, corr in cor_dat.items():
    if abs(corr) > threshold:
      feature.append(label)
      value.append(corr)

  # Build the result once, after the loop: doing it inside the loop rebuilt the
  # frame on every iteration and left it undefined for an empty input.
  return pd.DataFrame(data=value, index=feature, columns=['corr value'])

In [None]:
# Keep the columns whose absolute correlation with the target (73) is above 0.40
target_corr = cormap[73]
top_corelated_values = get_corelated_col(target_corr, 0.40)
top_corelated_values

In [None]:
# Keep only the columns listed in the index of top_corelated_values.
# NOTE(review): this slices `df` (the frame before oversampling) although the
# correlations were computed on `new_df`, and `final_df` is not used by the
# later cells shown here - confirm the intent.
final_df = df[top_corelated_values.index]
final_df.head()

In [None]:
# Separate the oversampled frame into the target column (73) and the feature matrix
y = new_df[73]
X = new_df.drop(columns=[73])

In [None]:
# Standardise every feature to zero mean and unit variance
# (StandardScaler does not bound the values to the range [-1, 1]).
# NOTE(review): the scaler is fitted on all rows before the train/test split
# below, so test-set statistics influence the scaling - consider fitting it
# on the training split only.

from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X = pd.DataFrame(scaler.fit_transform(X), columns=X.columns)
X.head()

In [None]:
# Now split the data into train and test sets

from sklearn.model_selection import train_test_split

# Hold out 10% of the rows for testing. random_state fixes the shuffle so the
# split (and the reported metrics) are reproducible; stratify=y keeps the class
# proportions the same in the train and test sets.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.1, random_state=42, stratify=y
)

In [None]:
# Train a support-vector classifier on the training split

from sklearn.svm import SVC

# RBF (Radial Basis Function) kernel
svc_params = {'kernel': 'rbf'}
model = SVC(**svc_params)
model.fit(X_train, y_train)

In [None]:
# Predict the held-out rows and show actual vs. predicted labels side by side

y_pred = model.predict(X_test)

pred_df = pd.DataFrame({'Actual': y_test}).assign(Predicted=y_pred)
pred_df.head()

In [None]:
from sklearn.metrics import confusion_matrix

# Rows of the matrix are the actual classes, columns the predicted classes
mat = confusion_matrix(y_test, y_pred)
fig, ax = plt.subplots(figsize=(5, 5))
# fmt='d' prints the integer counts in full; the default '.2g' format shows
# any count of 100 or more in scientific notation (e.g. 1.2e+02).
sns.heatmap(mat, annot=True, fmt='d', ax=ax)
# Label the axes after drawing so the heatmap call does not overwrite them
ax.set(xlabel='Predicted', ylabel='Actual');

In [None]:
from sklearn import metrics

# Measure the accuracy score. accuracy_score expects (y_true, y_pred) in that
# order; plain accuracy is symmetric, but the documented order keeps this call
# consistent with the other metric calls in the notebook.
print("Accuracy score of the predictions: {value:.2f} %".format(value=metrics.accuracy_score(y_test, y_pred)*100))
