In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings(action='ignore')
%matplotlib inline
from sklearn.model_selection import train_test_split
pd.set_option('display.max_columns',None)

In [None]:
import os
import matplotlib.pyplot as plt
import seaborn as sns
# Print the full path of every file found under the Kaggle input mount.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load the weatherAUS CSV from the Kaggle input mount into the working frame.
df = pd.read_csv(filepath_or_buffer='/kaggle/input/weather-dataset-rattle-package/weatherAUS.csv')

In [None]:
df.head()                          # preview the first rows of the freshly loaded frame

In [None]:
#Checking the Datatypes

In [None]:
df.dtypes         # Date is stored as object dtype, so it needs converting to a datetime dtype

In [None]:
# Parse Date into datetime64; errors='coerce' turns unparseable values into NaT
# instead of raising.
df = df.assign(Date=pd.to_datetime(df['Date'], errors='coerce'))

In [None]:
# One-column frame of the dtypes, re-checked after the Date conversion.
c = df.dtypes.to_frame(name='dtype')
c

In [None]:
#Checking for Null values

In [None]:
import missingno as msno
# missingno heatmap of the frame, used to inspect how missing values co-occur across columns.
msno.heatmap(df)
plt.show()

In [None]:
# missingno bar chart of per-column completeness, sorted in descending order.
msno.bar(df,sort= 'descending')
plt.show()                        # Cloud3pm, Sunshine and Evaporation have the most null values

In [None]:
# Percentage of nulls per column, stacked under the dtype row so both can be
# read side by side (columns = original features, rows = dtype / null %).
n = (df.isna().sum() * 100 / len(df)).to_frame(name='null_value_per')
s = pd.concat([c, n], axis='columns').transpose()
s

In [None]:
#Evaporation, Sunshine and Cloud3pm have more than 40% null values
#Cloud9am also has close to 40% null values
#Date and Location do not have any null values
#the rest of the columns have 10% null values or fewer

In [None]:
#dropping rows that have nulls in features with a low null percentage (the code below uses a threshold of <6%, not 5%)

In [None]:
# Columns whose null percentage is below 6. The first two columns are skipped
# (presumably Date and Location, which have no nulls -- confirm against s).
null_pct_row = s.iloc[1]
l = [col for col in s.columns[2:] if null_pct_row[col] < 6]

In [None]:
# Remove every row that has a null in any of the low-null columns collected in `l`.
df.dropna(axis='index', subset=l, inplace=True)

In [None]:
# Null percentage per column after the row drop.
df.isna().sum().mul(100).div(len(df))

In [None]:
df.shape        # (rows, columns) remaining after dropping the rows with null values

In [None]:
# Replace the remaining numeric nulls with each column's median.
# The result is assigned back rather than using `df[i].fillna(..., inplace=True)`:
# that chained in-place form is deprecated by pandas and stops updating `df`
# once Copy-on-Write is active. Warnings are silenced in this notebook, so the
# fill would otherwise fail without any visible message.
for i in ['Evaporation','Sunshine','WindGustSpeed','Pressure9am','Pressure3pm','Cloud9am','Cloud3pm']:
    df[i] = df[i].fillna(df[i].median())

In [None]:
# Replace nulls in the categorical columns with the most frequent value.
# Series.mode() returns a Series (there can be ties); passing that Series to
# fillna aligns it on the index, so only the row labelled 0 could ever be
# filled. Take the first mode as a scalar instead, and assign the result back
# to avoid chained in-place modification.
for i in ['WindGustDir','WindDir9am','WindDir3pm','RainToday','RainTomorrow']:
    modes = df[i].mode()
    if not modes.empty:          # an all-null column has no mode; leave it untouched
        df[i] = df[i].fillna(modes.iloc[0])

In [None]:
#the null values of WindGustDir and WindDir9am did not get replaced, so replacing them explicitly with the first mode

In [None]:
# Fill the two wind-direction columns with their first mode (a scalar).
# Assigned back instead of chained `inplace=True`, which pandas deprecates and
# which does not modify `df` under Copy-on-Write.
for i in ['WindGustDir','WindDir9am']:
    df[i] = df[i].fillna(df[i].mode()[0])

In [None]:
df.isna().sum().mul(100).div(len(df))       # now there are no null values in the dataset

In [None]:
#checking Correlation

In [None]:
# Correlation heatmap of the numeric features.
# Restrict to numeric columns explicitly: DataFrame.corr() no longer drops
# object/datetime columns silently in pandas >= 2.0 and raises on them instead.
plt.figure(figsize=(20,20))
sns.heatmap(df.select_dtypes(include='number').corr(),annot=True)
plt.show()

In [None]:
#MinTemp is highly correlated with MaxTemp, Temp9am and Temp3pm
#MaxTemp is highly correlated with MinTemp, Temp9am and Temp3pm
#WindGustSpeed is correlated with WindSpeed3pm
#Humidity9am is correlated with Humidity3pm
#Pressure9am is highly correlated with Pressure3pm
#Temp9am is highly correlated with Temp3pm

In [None]:
# Pairwise scatter plots with KDE curves on the diagonal.
sns.pairplot(data=df, diag_kind='kde')
plt.show()

In [None]:
#MinTemp is highly correlated with MaxTemp, Temp9am and Temp3pm
#MaxTemp is highly correlated with MinTemp, Temp9am and Temp3pm
#WindGustSpeed is correlated with WindSpeed3pm
#Humidity9am is correlated with Humidity3pm
#Pressure9am is highly correlated with Pressure3pm
#Temp9am is highly correlated with Temp3pm

In [None]:
#MinTemp, MaxTemp, Temp9am, Temp3pm and Humidity3pm are almost normally distributed
#evaporation

In [None]:
# Skewness of the numeric features only; DataFrame.skew() raises on
# object/datetime columns in pandas >= 2.0.
df.select_dtypes(include='number').skew()

In [None]:
# Bar chart of the target's class counts.
df['RainTomorrow'].value_counts().plot(kind='bar')
plt.show()                                      # RainTomorrow is imbalanced: there are many more 'No' values

In [None]:
# One bar chart of value counts per object-dtype (categorical) column.
# select_dtypes replaces the `df.dtypes==np.object` mask: the np.object alias
# was removed in NumPy 1.24 and raises AttributeError there.
for i in df.select_dtypes(include='object').columns:
    df[i].value_counts().plot.bar(figsize=(15,8),label=i)
    plt.legend()
    plt.show()

In [None]:
#rain today data is also uneven

In [None]:
# Wind speed at 9am and 3pm over time.
# x/y are passed by keyword: seaborn >= 0.12 treats the first positional
# argument as `data` and no longer accepts x and y positionally.
plt.figure(figsize=(15,8))
sns.lineplot(x=df['Date'],y=df['WindSpeed9am'],label='9amwindspeed')
sns.lineplot(x=df['Date'],y=df['WindSpeed3pm'],label='3pmwindspeed')
plt.legend()
plt.ylabel('WindSpeed')
plt.show()

In [None]:
#in 2008 wind speed was more as compared to following year,also windspeed at 9am is more than windspeed at 3pm

In [None]:
# Humidity at 9am and 3pm over time.
# x/y are passed by keyword: seaborn >= 0.12 treats the first positional
# argument as `data` and no longer accepts x and y positionally.
plt.figure(figsize=(15,8))
sns.lineplot(x=df['Date'],y=df['Humidity9am'],label='9amHumidity')
sns.lineplot(x=df['Date'],y=df['Humidity3pm'],label='3pmHumidity')
plt.legend()
plt.ylabel('Humidity')
plt.show()

In [None]:
#humidity at 9am is more than at 3 pm ,also in 2008 humidity was more as compared to following years

In [None]:
# Pressure at 9am and 3pm over time.
# x/y are passed by keyword: seaborn >= 0.12 treats the first positional
# argument as `data` and no longer accepts x and y positionally.
plt.figure(figsize=(15,8))
sns.lineplot(x=df['Date'],y=df['Pressure9am'],label='9amPressure')
sns.lineplot(x=df['Date'],y=df['Pressure3pm'],label='3pmPressure')
plt.legend()
plt.ylabel('Pressure')
plt.show()

In [None]:
#pressure at 9am and 3pm is almost same and it is same for  all the years

In [None]:
# Cloud cover at 9am and 3pm over time.
# x/y are passed by keyword: seaborn >= 0.12 treats the first positional
# argument as `data` and no longer accepts x and y positionally.
plt.figure(figsize=(15,8))
sns.lineplot(x=df['Date'],y=df['Cloud9am'],label='9amCloud',linewidth=1)
sns.lineplot(x=df['Date'],y=df['Cloud3pm'],label='3pmCloud',linewidth=1)
plt.legend()
plt.ylabel('Cloud')
plt.show()

In [None]:
#cloud at 9am and 3pm is almost same and it is same for  all the years

In [None]:
# Temperature at 9am and 3pm over time.
# x/y are passed by keyword: seaborn >= 0.12 treats the first positional
# argument as `data` and no longer accepts x and y positionally.
plt.figure(figsize=(15,8))
sns.lineplot(x=df['Date'],y=df['Temp9am'],label='9amTemp',linewidth=1)
sns.lineplot(x=df['Date'],y=df['Temp3pm'],label='3pmTemp',linewidth=1)
plt.legend()
plt.ylabel('Temp')
plt.show()

In [None]:
#temp at 9am is less than at 3 pm ,also the temp is almost same for all years
#it is seeing a low in mid of the year

In [None]:
#cloud and pressure are same at all times

In [None]:
# Distribution of cloud cover (9am left, 3pm right) split by RainTomorrow.
# x/y are keywords because seaborn >= 0.12 rejects positional x and y.
plt.figure(figsize=(15,8))
plt.subplot(1, 2, 1)
sns.violinplot(x=df['RainTomorrow'],y=df['Cloud9am'])
plt.subplot(1, 2, 2)
sns.violinplot(x=df['RainTomorrow'],y=df['Cloud3pm'])
plt.show()

In [None]:
# Distribution of pressure (9am left, 3pm right) split by RainTomorrow.
# x/y are keywords because seaborn >= 0.12 rejects positional x and y.
plt.figure(figsize=(15,8))
plt.subplot(1, 2, 1)
sns.violinplot(x=df['RainTomorrow'],y=df['Pressure9am'])
plt.subplot(1, 2, 2)
sns.violinplot(x=df['RainTomorrow'],y=df['Pressure3pm'])
plt.show()

In [None]:
# Mean cloud cover (9am left, 3pm right) for each RainTomorrow class.
# x/y are keywords because seaborn >= 0.12 rejects positional x and y.
plt.figure(figsize=(15,8))
plt.subplot(1, 2, 1)
sns.barplot(x=df['RainTomorrow'],y=df['Cloud9am'])
plt.subplot(1, 2, 2)
sns.barplot(x=df['RainTomorrow'],y=df['Cloud3pm'])
plt.show()

In [None]:
# Mean pressure (3pm left, 9am right) for each RainTomorrow class.
# x/y are keywords because seaborn >= 0.12 rejects positional x and y.
plt.figure(figsize=(15,8))
plt.subplot(1, 2, 1)
sns.barplot(x=df['RainTomorrow'],y=df['Pressure3pm'])
plt.subplot(1, 2, 2)
sns.barplot(x=df['RainTomorrow'],y=df['Pressure9am'])
plt.show()

In [None]:
#when clouds are more, it is more likely to rain in both 9am or 3 pm case

In [None]:
#pressure distribution is almost same for both raining and not raining prediction

In [None]:
# Mean wind speed (3pm left, 9am right) for each RainTomorrow class.
# x/y are keywords because seaborn >= 0.12 rejects positional x and y.
plt.figure(figsize=(15,8))
plt.subplot(1, 2, 1)
sns.barplot(x=df['RainTomorrow'],y=df['WindSpeed3pm'])
plt.subplot(1, 2, 2)
sns.barplot(x=df['RainTomorrow'],y=df['WindSpeed9am'])
plt.show()

In [None]:
#wind speed is more when its likely to rain both in 9am,3pm

In [None]:
# Mean temperature (3pm left, 9am right) for each RainTomorrow class.
# x/y are keywords because seaborn >= 0.12 rejects positional x and y.
plt.figure(figsize=(15,8))
plt.subplot(1, 2, 1)
sns.barplot(x=df['RainTomorrow'],y=df['Temp3pm'])
plt.subplot(1, 2, 2)
sns.barplot(x=df['RainTomorrow'],y=df['Temp9am'])
plt.show()

In [None]:
#at 9am temp is almost same
#at 3pm temp is low when raining

In [None]:
# Counts per wind direction (3pm left, 9am right), coloured by RainTomorrow.
# The column is passed as x=...: in seaborn >= 0.12 the first positional
# argument of countplot is `data`, not x.
plt.figure(figsize=(15,8))
plt.subplot(1, 2, 1)
sns.countplot(x=df['WindDir3pm'],hue=df['RainTomorrow'])
plt.subplot(1, 2, 2)
sns.countplot(x=df['WindDir9am'],hue=df['RainTomorrow'])
plt.show()

In [None]:
#at 9am wind direction is mostly North when its not gonna rain # is mostly north when its gonna rain
#at 3pm wind direction can be mostly North,west or WN when its gonna rain#at3pm wind direction is mostly SE when its not raining
#but data is not even 
#WindDir9am can be a good feature

In [None]:
# Left: mean wind-gust speed per RainTomorrow class.
# Right: counts per gust direction, coloured by RainTomorrow.
# Columns are passed by keyword because seaborn >= 0.12 no longer accepts
# x/y positionally (the first positional argument is `data`).
plt.figure(figsize=(15,8))
plt.subplot(121)
sns.barplot(x=df['RainTomorrow'],y=df['WindGustSpeed'])
plt.subplot(122)
sns.countplot(x=df['WindGustDir'],hue=df['RainTomorrow'])
plt.show()

In [None]:
#windgustspeed is more when rain is to happen tomorrow
#windgustdir is almost same for both when its to rain or its not to rain
#windgustspeed is more helpful in raintomorrow

In [None]:
# Counts per Location, coloured by RainTomorrow.
# Location is passed as x=...: in seaborn >= 0.12 the first positional
# argument of countplot is `data`, not x.
plt.figure(figsize=(20,8))
sns.countplot(x=df['Location'],hue=df['RainTomorrow'])
plt.xticks(rotation=90)
plt.show()                       # location-wise RainTomorrow plot

In [None]:
# Side-by-side pie charts of the class shares for RainToday and RainTomorrow.
plt.figure(figsize=(20, 8))
for position, column in enumerate(['RainToday', 'RainTomorrow'], start=1):
    plt.subplot(1, 2, position)
    df[column].value_counts().plot(kind='pie', autopct='%0.2f%%')
plt.show()

In [None]:
#rain today and rain tomorrow have almost same biased data

In [None]:
from sklearn.preprocessing import LabelEncoder,StandardScaler,PowerTransformer
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression

In [None]:
#Doing label encoding of the categorical columns

In [None]:
# Integer-encode every object-dtype column in place (the encoder is refit per
# column, so `le` ends up holding only the last column's classes).
# select_dtypes replaces the `df.dtypes==np.object` mask: the np.object alias
# was removed in NumPy 1.24.
le=LabelEncoder()
for i in df.select_dtypes(include='object').columns:
    df[i]=le.fit_transform(df[i])

In [None]:
# Preview the frame after label encoding.
df.head()

In [None]:
# Target is RainTomorrow; the features are every other column except Date.
y = df['RainTomorrow']
X = df.drop(columns=['Date', 'RainTomorrow'])

In [None]:
# 75% train / 25% test, with a fixed seed for a repeatable split.
xtrain, xtest, ytrain, ytest = train_test_split(X, y, random_state=1, train_size=0.75)

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
# Baseline: standard scaling -> power transform -> logistic regression.
pipe = Pipeline([
    ('ss', StandardScaler()),
    ('pt', PowerTransformer()),
    ('lr', LogisticRegression()),
])
pipe.fit(xtrain, ytrain)
a, a1 = pipe.score(xtest, ytest), pipe.score(xtrain, ytrain)   # test accuracy, train accuracy
print(a,a1)            #it is overfit model

In [None]:
from sklearn.ensemble import RandomForestClassifier,AdaBoostClassifier,GradientBoostingClassifier,BaggingClassifier
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier

In [None]:
# Random forest behind the same scaling / power-transform steps.
# random_state fixes the forest's bootstrap and feature sampling so the
# printed scores (and the overfitting remark) are reproducible on re-run.
# Steps are given as a list, the documented type for Pipeline(steps=...).
pipe=Pipeline([
('ss',StandardScaler()),
('pt',PowerTransformer()),
('rfr',RandomForestClassifier(n_estimators=100,random_state=1))
])
pipe.fit(xtrain,ytrain)
b=pipe.score(xtest,ytest)        # accuracy on the held-out split
b1=pipe.score(xtrain,ytrain)     # accuracy on the training split
print(b,b1)            #it is overfit model

In [None]:
# AdaBoost behind the same scaling / power-transform steps.
# random_state is set so repeated runs print the same scores; steps are given
# as a list, the documented type for Pipeline(steps=...).
pipe=Pipeline([
('ss',StandardScaler()),
('pt',PowerTransformer()),
('rfr',AdaBoostClassifier(n_estimators=100,random_state=1))
])
pipe.fit(xtrain,ytrain)
c=pipe.score(xtest,ytest)        # accuracy on the held-out split
c1=pipe.score(xtrain,ytrain)     # accuracy on the training split
print(c,c1)

In [None]:
# Gradient boosting behind the same scaling / power-transform steps.
# random_state is set so repeated runs print the same scores; steps are given
# as a list, the documented type for Pipeline(steps=...).
pipe=Pipeline([
('ss',StandardScaler()),
('pt',PowerTransformer()),
('rfr',GradientBoostingClassifier(n_estimators=100,random_state=1))
])
pipe.fit(xtrain,ytrain)
d=pipe.score(xtest,ytest)        # accuracy on the held-out split
d1=pipe.score(xtrain,ytrain)     # accuracy on the training split
print(d,d1)

In [None]:
# Bagging classifier behind the same scaling / power-transform steps.
# random_state fixes the bootstrap sampling so repeated runs print the same
# scores; steps are given as a list, the documented type for Pipeline(steps=...).
pipe=Pipeline([
('ss',StandardScaler()),
('pt',PowerTransformer()),
('rfr',BaggingClassifier(n_estimators=100,random_state=1))
])
pipe.fit(xtrain,ytrain)
e=pipe.score(xtest,ytest)        # accuracy on the held-out split
e1=pipe.score(xtrain,ytrain)     # accuracy on the training split
print(e,e1)

In [None]:
# Depth-limited decision tree behind the same scaling / power-transform steps.
# random_state is set because the tree permutes features when searching for
# splits, so ties can otherwise resolve differently between runs. Steps are
# given as a list, the documented type for Pipeline(steps=...).
pipe=Pipeline([
('ss',StandardScaler()),
('pt',PowerTransformer()),
('dtc',DecisionTreeClassifier(max_depth=5,random_state=1))
])
pipe.fit(xtrain,ytrain)
f=pipe.score(xtest,ytest)        # accuracy on the held-out split
f1=pipe.score(xtrain,ytrain)     # accuracy on the training split
print(f,f1)

In [None]:
# XGBoost (max_depth=7, gamma=2) behind the same scaling / power-transform steps.
pipe = Pipeline([
    ('ss', StandardScaler()),
    ('pt', PowerTransformer()),
    ('xg', XGBClassifier(max_depth=7, gamma=2)),
])
pipe.fit(xtrain, ytrain)
g, g1 = pipe.score(xtest, ytest), pipe.score(xtrain, ytrain)   # test accuracy, train accuracy
print(g,g1)

In [None]:
# Collect (model name, test score, train score) per model, then transpose the
# rows into the three parallel lists that feed the comparison frame.
l, l1, l2 = (list(column) for column in zip(
    ('LogisticRegression', a, a1),
    ('RandomForestClassifier', b, b1),
    ('AdaBoostClassifier', c, c1),
    ('GradientBoostingClassifier', d, d1),
    ('BaggingClassifier', e, e1),
    ('DecisionTreeClassifier', f, f1),
    ('XGBClassifier', g, g1),
))
comp = pd.DataFrame({'model_name': l, 'test_score': l1, 'training_score': l2})
comp.sort_values(by='test_score', ascending=False)

In [None]:
#Checking the cross_val_score of XGBoost, BaggingClassifier and RandomForest

In [None]:
#XGBoost cv score (this cell cross-validates xgb; bc and rfr are only defined here)
# 10-fold accuracy on the full X / y, without the scaling pipeline.
xgb=XGBClassifier(max_depth=7,gamma=2)
# random_state pins the resampling of the bagging and forest estimators so
# their cross-validation scores are repeatable.
bc=BaggingClassifier(n_estimators=100,random_state=1)
rfr=RandomForestClassifier(n_estimators=100,random_state=1)
from sklearn.model_selection import cross_val_score,GridSearchCV
cross1=cross_val_score(estimator=xgb,X=X,y=y,cv=10,scoring='accuracy')
cross1

In [None]:
# Mean accuracy across the folds for XGBoost.
cross1.mean()

In [None]:
from sklearn.model_selection import cross_val_score,GridSearchCV
# 10-fold cross-validated accuracy for the random forest on the full X / y.
cross2 = cross_val_score(rfr, X, y, scoring='accuracy', cv=10)
cross2

In [None]:
# Mean accuracy across the folds for the random forest.
cross2.mean()

In [None]:
# 10-fold cross-validated accuracy for the bagging classifier on the full X / y.
cross3 = cross_val_score(bc, X, y, scoring='accuracy', cv=10)
cross3

In [None]:
# Mean accuracy across the folds for the bagging classifier.
cross3.mean()

In [None]:
# 10-fold cross-validated accuracy for gradient boosting on the full X / y.
# random_state is set so the fold scores are repeatable across runs.
gb=GradientBoostingClassifier(n_estimators=100,random_state=1)
cross4=cross_val_score(estimator=gb,X=X,y=y,cv=10,scoring='accuracy')
cross4

In [None]:
# Mean accuracy across the folds for gradient boosting.
cross4.mean()

In [None]:
#from here we can choose randomforest for further GridSearchCV
# Exhaustive search over forest size, split criterion and depth, scored by
# accuracy on the training split. random_state is fixed on the forest so that
# differences between grid points come from the parameters rather than from
# run-to-run sampling noise.
rfr=RandomForestClassifier(random_state=1)
param=[{'n_estimators':[10,100,500,1000],'criterion':['entropy','gini'],'max_depth':[3,5,7,9]}]
grid=GridSearchCV(estimator=rfr,param_grid=param,scoring='accuracy',n_jobs=-1)
grid.fit(xtrain,ytrain)

In [None]:
# Score of the fitted search on the held-out split.
grid.score(xtest,ytest)

In [None]:
# Estimator (with its parameters) that the grid search selected.
grid.best_estimator_

In [None]:
# Random forest with 500 trees behind the scaling / power-transform steps.
# NOTE(review): only n_estimators is set here; confirm whether the other
# parameters of grid.best_estimator_ were meant to be carried over.
# random_state makes the fit (and the scores in the next cells) repeatable;
# steps are given as a list, the documented type for Pipeline(steps=...).
pipe=Pipeline([
('ss',StandardScaler()),
('pt',PowerTransformer()),
('rfr',RandomForestClassifier(n_estimators=500,random_state=1))
])
pipe.fit(xtrain,ytrain)

In [None]:
# Accuracy of the 500-tree pipeline on the held-out split.
pipe.score(xtest,ytest)

In [None]:
pipe.score(xtrain,ytrain)          # training accuracy; the model is still overfitting

In [None]:
#cross_val and grid search are not improving the model much

In [None]:
#Now re-analysing the models after removing outliers

In [None]:
# Reload the raw CSV so the outlier experiment starts from unencoded data.
df = pd.read_csv(filepath_or_buffer='/kaggle/input/weather-dataset-rattle-package/weatherAUS.csv')
df.head(n=5)

In [None]:
# Parse Date into datetime64; unparseable values become NaT (errors='coerce').
df = df.assign(Date=pd.to_datetime(df['Date'], errors='coerce'))

In [None]:
# Rebuild the dtype / null-percentage summary for the reloaded frame
# (columns = features, rows = dtype and null %).
c = df.dtypes.to_frame(name='dtype')
n = (df.isna().sum() * 100 / len(df)).to_frame(name='null_value_per')
s = pd.concat([c, n], axis='columns').transpose()

In [None]:
# Columns whose null percentage is below 6. The first two columns are skipped
# (presumably Date and Location -- confirm against s).
null_pct_row = s.iloc[1]
l = [col for col in s.columns[2:] if null_pct_row[col] < 6]

In [None]:
# Remove every row that has a null in any of the low-null columns collected in `l`.
df.dropna(axis='index', subset=l, inplace=True)

In [None]:
# Null percentage per column after the row drop.
df.isna().sum().mul(100).div(len(df))

In [None]:
# Replace the remaining numeric nulls with each column's median.
# Assigned back rather than using `df[i].fillna(..., inplace=True)`: the
# chained in-place form is deprecated by pandas and does not update `df`
# under Copy-on-Write, and this notebook silences the warning about it.
for i in ['Evaporation','Sunshine','WindGustSpeed','Pressure9am','Pressure3pm','Cloud9am','Cloud3pm']:
    df[i] = df[i].fillna(df[i].median())

In [None]:
# Replace nulls in the categorical columns with the most frequent value.
# Series.mode() returns a Series; handing it to fillna aligns on the index and
# fills at most the row labelled 0, so the first mode is taken as a scalar.
# The result is assigned back to avoid chained in-place modification.
for i in ['WindGustDir','WindDir9am','WindDir3pm','RainToday','RainTomorrow']:
    modes = df[i].mode()
    if not modes.empty:          # an all-null column has no mode; leave it untouched
        df[i] = df[i].fillna(modes.iloc[0])

In [None]:
#the null values of WindGustDir and WindDir9am did not get replaced, so replacing them explicitly with the first mode

In [None]:
# Fill the two wind-direction columns with their first mode (a scalar).
# Assigned back instead of chained `inplace=True`, which pandas deprecates and
# which does not modify `df` under Copy-on-Write.
for i in ['WindGustDir','WindDir9am']:
    df[i] = df[i].fillna(df[i].mode()[0])

In [None]:
df.isna().sum().mul(100).div(len(df))       # now there are no null values in the dataset

In [None]:
# Drop rows that fall outside the 1.5*IQR fences, one numeric column at a time.
# Later columns compute their quartiles on the rows that survived earlier
# columns, because the frame shrinks inside the loop.
#
# The original selected columns with `df.loc[:, df.dtypes!=np.object][1:]`:
#   * `[1:]` slices ROWS, not columns, so it did not skip the Date column;
#   * `np.object` was removed in NumPy 1.24.
# select_dtypes(include='number') picks exactly the numeric features and
# leaves Date and the categorical columns out.
# NOTE(review): for a heavily skewed column such as Rainfall this rule may
# discard a large share of the rainy days -- check df.shape afterwards.
for i in df.select_dtypes(include='number').columns:
    q1=df[i].quantile(0.25)
    q3=df[i].quantile(0.75)
    iqr=q3-q1
    ll=q1-1.5*iqr      # lower fence
    ul=q3+1.5*iqr      # upper fence
    a=df[(df[i]<ll) | (df[i]>ul)].index
    df.drop(a,axis=0,inplace=True)

In [None]:
# Preview the frame after the outlier rows were dropped.
df.head()

In [None]:
# (rows, columns) remaining after outlier removal.
df.shape

In [None]:
# Integer-encode every object-dtype column in place (the encoder is refit per
# column). select_dtypes replaces the `df.dtypes==np.object` mask because the
# np.object alias was removed in NumPy 1.24.
le=LabelEncoder()
for i in df.select_dtypes(include='object').columns:
    df[i]=le.fit_transform(df[i])

In [None]:
# Box plot of every column to eyeball what is left after outlier removal.
df.plot(kind='box', figsize=(20, 15))
plt.xticks(rotation=90)
plt.show()

In [None]:
#rebuilding models without outliers

In [None]:
# Target is RainTomorrow; the features are every other column except Date.
y1 = df['RainTomorrow']
X1 = df.drop(columns=['Date', 'RainTomorrow'])

In [None]:
# 75% train / 25% test on the outlier-free data, same seed as before.
xtrain1, xtest1, ytrain1, ytest1 = train_test_split(X1, y1, random_state=1, train_size=0.75)

In [None]:
# Logistic regression on the outlier-free split, behind scaling and power transform.
pipe = Pipeline([
    ('ss', StandardScaler()),
    ('pt', PowerTransformer()),
    ('lr', LogisticRegression()),
])
pipe.fit(xtrain1, ytrain1)
a, a1 = pipe.score(xtest1, ytest1), pipe.score(xtrain1, ytrain1)   # test accuracy, train accuracy
print(a,a1)

In [None]:
# Random forest on the outlier-free split.
# random_state fixes the forest's sampling so the scores quoted in the comment
# below can be reproduced; steps are given as a list, the documented type for
# Pipeline(steps=...).
pipe=Pipeline([
('ss',StandardScaler()),
('pt',PowerTransformer()),
('rfr',RandomForestClassifier(n_estimators=100,random_state=1))
])
pipe.fit(xtrain1,ytrain1)
b=pipe.score(xtest1,ytest1)        # accuracy on the held-out split
b1=pipe.score(xtrain1,ytrain1)     # accuracy on the training split
print(b,b1)                                       #random forest is giving 89per accuracy#it is still overfit model

In [None]:
# AdaBoost on the outlier-free split.
# random_state is set so repeated runs print the same scores; steps are given
# as a list, the documented type for Pipeline(steps=...).
pipe=Pipeline([
('ss',StandardScaler()),
('pt',PowerTransformer()),
('rfr',AdaBoostClassifier(n_estimators=100,random_state=1))
])
pipe.fit(xtrain1,ytrain1)
c=pipe.score(xtest1,ytest1)
c1=pipe.score(xtrain1,ytrain1)              #adaboost is giving 89 per accuracy plus it is not overfitting
print(c,c1)

In [None]:
# Gradient boosting on the outlier-free split.
# random_state is set so repeated runs print the same scores; steps are given
# as a list, the documented type for Pipeline(steps=...).
pipe=Pipeline([
('ss',StandardScaler()),
('pt',PowerTransformer()),
('rfr',GradientBoostingClassifier(n_estimators=100,random_state=1))
])
pipe.fit(xtrain1,ytrain1)
d=pipe.score(xtest1,ytest1)
d1=pipe.score(xtrain1,ytrain1)         #since boosting improves overfitting model so ada and gradient boosting are doing better
print(d,d1)                                           #gradientboost is giving 89 per accuracy plus it is not overfitting

In [None]:
# Bagging classifier on the outlier-free split.
# random_state fixes the bootstrap sampling so repeated runs print the same
# scores; steps are given as a list, the documented type for Pipeline(steps=...).
pipe=Pipeline([
('ss',StandardScaler()),
('pt',PowerTransformer()),
('rfr',BaggingClassifier(n_estimators=100,random_state=1))
])
pipe.fit(xtrain1,ytrain1)
e=pipe.score(xtest1,ytest1)        # accuracy on the held-out split
e1=pipe.score(xtrain1,ytrain1)     # accuracy on the training split
print(e,e1)                                    #its overfit model so no bagging technique will work

In [None]:
# Depth-limited decision tree on the outlier-free split.
# random_state is set because the tree permutes features when searching for
# splits, so ties can otherwise resolve differently between runs. Steps are
# given as a list, the documented type for Pipeline(steps=...).
pipe=Pipeline([
('ss',StandardScaler()),
('pt',PowerTransformer()),
('dtc',DecisionTreeClassifier(max_depth=5,random_state=1))
])
pipe.fit(xtrain1,ytrain1)
f=pipe.score(xtest1,ytest1)        # accuracy on the held-out split
f1=pipe.score(xtrain1,ytrain1)     # accuracy on the training split
print(f,f1)

In [None]:
# XGBoost (max_depth=7, gamma=2) on the outlier-free split.
pipe = Pipeline([
    ('ss', StandardScaler()),
    ('pt', PowerTransformer()),
    ('xg', XGBClassifier(max_depth=7, gamma=2)),
])
pipe.fit(xtrain1, ytrain1)
g, g1 = pipe.score(xtest1, ytest1), pipe.score(xtrain1, ytrain1)   # test accuracy, train accuracy
print(g,g1)                               #it also has a variance error but less than bagging models

In [None]:
# Collect (model name, test score, train score) per model for the outlier-free
# run, then transpose the rows into the three lists that feed the frame.
l, l1, l2 = (list(column) for column in zip(
    ('LogisticRegression', a, a1),
    ('RandomForestClassifier', b, b1),
    ('AdaBoostClassifier', c, c1),
    ('GradientBoostingClassifier', d, d1),
    ('BaggingClassifier', e, e1),
    ('DecisionTreeClassifier', f, f1),
    ('XGBClassifier', g, g1),
))
comp = pd.DataFrame({'model_name': l, 'test_score': l1, 'training_score': l2})
comp.sort_values(by='test_score', ascending=False)

In [None]:
#now feature selection can be done

In [None]:
from yellowbrick.target import FeatureCorrelation

In [None]:
# Plot each feature's correlation with the target using yellowbrick.
# show() replaces poof(), which yellowbrick deprecated in favour of show().
features=list(X1.columns)
visualizer = FeatureCorrelation(labels = features)
visualizer.fit(X1, y1)
visualizer.show()

In [None]:
# Keep only the hand-picked columns with the highest target correlation.
yn = df['RainTomorrow']
Xn = df.loc[:, [
    'Humidity3pm', 'RainToday', 'Pressure9am', 'Pressure3pm', 'Rainfall',
    'WindGustSpeed', 'WindSpeed9am', 'WindSpeed3pm', 'Humidity9am',
    'MaxTemp', 'MinTemp', 'Temp3pm', 'Location',
]]

In [None]:
# 75% train / 25% test on the reduced feature set, same seed as before.
xtrain11, xtest11, ytrain11, ytest11 = train_test_split(Xn, yn, random_state=1, train_size=0.75)

In [None]:
#only gradient boosting and AdaBoost will be fitted from here on

In [None]:
# AdaBoost on the correlation-selected features.
# random_state is set so repeated runs print the same scores; steps are given
# as a list, the documented type for Pipeline(steps=...).
pipe=Pipeline([
('ss',StandardScaler()),
('pt',PowerTransformer()),
('rfr',AdaBoostClassifier(n_estimators=100,random_state=1))
])
pipe.fit(xtrain11,ytrain11)
a=pipe.score(xtest11,ytest11)
a1=pipe.score(xtrain11,ytrain11)              #adaboost is giving around 89 per accuracy plus it is not overfitting
print(a,a1)

In [None]:
# Gradient boosting on the correlation-selected features.
# random_state is set so repeated runs print the same scores; steps are given
# as a list, the documented type for Pipeline(steps=...).
pipe=Pipeline([
('ss',StandardScaler()),
('pt',PowerTransformer()),
('rfr',GradientBoostingClassifier(n_estimators=100,random_state=1))
])
pipe.fit(xtrain11,ytrain11)
d=pipe.score(xtest11,ytest11)        # accuracy on the held-out split
d1=pipe.score(xtrain11,ytrain11)     # accuracy on the training split
print(d,d1)                        #gradient boosting is also giving more than 89 per accuracy and not overfitting

In [None]:
from mlxtend.feature_selection import SequentialFeatureSelector

In [None]:
# Sequential feature selection down to 15 features, scored by accuracy with a
# gradient-boosting estimator on the outlier-free training split.
# random_state keeps the estimator's fits repeatable, so the selected feature
# set does not change between runs because of estimator randomness.
gbr=GradientBoostingClassifier(random_state=1)
sfs=SequentialFeatureSelector(estimator=gbr,k_features=15,scoring='accuracy',n_jobs=-1)
sfs.fit(xtrain1,ytrain1)

In [None]:
# Names of the features the sequential selector kept.
sfs.k_feature_names_

In [None]:
# Hard-coded 15-column feature subset (presumably copied from the selector's
# output in an earlier run -- re-check if the selection is re-run).
yn = df['RainTomorrow']
Xn = df.loc[:, [
    'Location', 'Rainfall', 'Evaporation',
    'WindGustDir', 'WindGustSpeed', 'WindDir9am', 'WindDir3pm',
    'WindSpeed9am', 'WindSpeed3pm',
    'Humidity9am', 'Humidity3pm',
    'Cloud9am', 'Cloud3pm',
    'Temp3pm', 'RainToday',
]]

In [None]:
# 75% train / 25% test on the selector-chosen features, same seed as before.
xtrain11, xtest11, ytrain11, ytest11 = train_test_split(Xn, yn, random_state=1, train_size=0.75)

In [None]:
# Gradient boosting on the 15-feature subset.
# random_state is set so repeated runs print the same scores; steps are given
# as a list, the documented type for Pipeline(steps=...).
pipe=Pipeline([
('ss',StandardScaler()),
('pt',PowerTransformer()),
('rfr',GradientBoostingClassifier(n_estimators=100,random_state=1))
])
pipe.fit(xtrain11,ytrain11)
d=pipe.score(xtest11,ytest11)        # accuracy on the held-out split
d1=pipe.score(xtrain11,ytrain11)     # accuracy on the training split
print(d,d1)

In [None]:
#feature selection is also not making much of an impact