In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory tree and print the full path of every file in it.
for folder, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Notebook-wide plot styling: seaborn's 'darkgrid' style and a 10x8 default figure size.
sns.set_style('darkgrid')
plt.rcParams.update({'figure.figsize': [10, 8]})

In [None]:
# Read the weather CSV from the Kaggle input directory and preview its first rows.
weather_csv_path = '/kaggle/input/weather-dataset-rattle-package/weatherAUS.csv'
df = pd.read_csv(weather_csv_path)
df.head()

In [None]:
# (rows, columns) of the frame as loaded, before any cleaning.
df.shape

In [None]:
# Column names, dtypes and non-null counts of the loaded frame.
df.info()

In [None]:
# Summary statistics, transposed so that each described column becomes one row.
df.describe().transpose()

In [None]:
# Number of missing entries in each column.
df.isna().sum()

In [None]:
# Heatmap of the boolean missing-value mask; row tick labels are switched off.
null_fig, null_ax = plt.subplots(figsize=(8, 6))
sns.heatmap(df.isnull(), yticklabels=False, ax=null_ax)

In [None]:
# Remove all missing values, one column at a time and in column order, so each
# step sees the rows already dropped by earlier columns:
#   * non-object column with more than NULL_ROW_DROP_THRESHOLD missing entries
#     -> drop the rows where it is missing;
#   * non-object column with fewer missing entries -> fill them with the column mean;
#   * object (categorical) column -> drop the rows where it is missing.
NULL_ROW_DROP_THRESHOLD = 50000

for col in df.columns.tolist():
    n_missing = df[col].isnull().sum()
    if n_missing == 0:
        # Nothing to fill or drop for this column.
        continue
    if df[col].dtype != 'object' and n_missing <= NULL_ROW_DROP_THRESHOLD:
        # Assign the filled column back to the frame. Calling an inplace method
        # on the df[col] selection is chained assignment: with pandas
        # Copy-on-Write it modifies a temporary and leaves df untouched.
        df[col] = df[col].fillna(df[col].mean())
    else:
        df.dropna(subset=[col], inplace=True)

# Every count should now be zero.
df.isnull().sum()

In [None]:
# (rows, columns) remaining after the missing-value handling.
df.shape

In [None]:
# Discard the 'Date' and 'Location' columns, which are not used from here on.
df.drop(columns=['Date', 'Location'], inplace=True)

In [None]:
# (rows, columns) after dropping 'Date' and 'Location'.
df.shape

In [None]:
# Number of distinct values present in the 'WindDir3pm' column.
wind_dir_3pm_counts = df['WindDir3pm'].value_counts()
wind_dir_3pm_counts.count()

In [None]:
from collections import Counter
# Tally how many rows carry each 'RainTomorrow' label.
Counter(df['RainTomorrow'])

In [None]:
# Separate the target column from the predictors:
# y is the 'RainTomorrow' column, X is every other column.
target_column = 'RainTomorrow'
y = df[target_column]
X = df.drop(columns=[target_column])

In [None]:
# Names of the object-dtype predictor columns, i.e. the ones that still hold
# categorical values and need converting to numbers.
categorical_features = list(X.select_dtypes(include='object').columns)
categorical_features

In [None]:
from sklearn.preprocessing import LabelEncoder
# Replace every categorical column with integer codes. One encoder object is
# re-fitted for each column, so after the loop it only holds the classes of the
# last column processed.
le = LabelEncoder()
for column in categorical_features:
    encoded_values = le.fit_transform(X[column])
    X[column] = encoded_values
X.head()

In [None]:
# Renumber the rows 0..n-1, discarding the old index (it has gaps where rows
# were dropped during cleaning).
X = X.reset_index(drop=True)
X.head()

In [None]:
# (rows, columns) of the encoded predictor frame.
X.shape

In [None]:
from sklearn.ensemble import ExtraTreesClassifier
# Fit a 200-tree extra-trees ensemble (fixed seed) on all predictors and keep
# its per-feature importance scores as a Series.
etc = ExtraTreesClassifier(n_estimators=200, random_state=0).fit(X, y)
important_fature = pd.Series(etc.feature_importances_)

In [None]:
# Pair each predictor name with its importance score, indexed by feature name.
important_features = pd.DataFrame(
    {'Feature': X.columns, 'Value': important_fature}
).set_index('Feature')
important_features

In [None]:
# Horizontal bar chart of the (up to) 20 highest importance scores.
important_features.nlargest(20, 'Value').plot.barh()

In [None]:
# Annotated heatmap of the pairwise correlations between predictors
# (values shown with two decimals).
corr_fig, corr_ax = plt.subplots(figsize=(16, 14))
sns.heatmap(X.corr(), annot=True, fmt='.2f', ax=corr_ax)

In [None]:
# Identify highly correlated features.
# For every pair of predictors whose correlation magnitude exceeds
# CORR_THRESHOLD, record the later of the two columns. The absolute value is
# compared so that strong negative correlations are caught as well as strong
# positive ones.
CORR_THRESHOLD = 0.6

corr = X.corr()
features = {
    corr.columns[i]
    for i in range(len(corr))
    for j in range(i)  # strictly below the diagonal: each pair is visited once
    if abs(corr.iloc[i, j]) > CORR_THRESHOLD
}
features

In [None]:
#Normalization
from sklearn.preprocessing import StandardScaler
# Standardise every predictor column; X is rebound to the scaler's output.
# NOTE(review): the scaler is fitted on the full data set before the train/test
# split, so test-set statistics influence the scaling. Consider fitting on the
# training split only.
scaler = StandardScaler()
X = scaler.fit_transform(X)

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for testing with a fixed seed. stratify=y keeps the
# class proportions of y the same in the training and test parts, which matters
# because the two classes are not equally frequent.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=0, stratify=y
)
print(X_train.shape, y_train.shape, X_test.shape, y_test.shape)

In [None]:
# Class distribution of the training labels.
Counter(y_train)

In [None]:
# Class distribution of the test labels.
Counter(y_test)

In [None]:
from sklearn.linear_model import LogisticRegression
# Logistic regression with class weights balanced against class frequency;
# yhat holds its predictions for the test rows.
lr = LogisticRegression(class_weight='balanced').fit(X_train, y_train)
yhat = lr.predict(X_test)

In [None]:
# Peek at the first five predicted labels.
yhat[:5]

In [None]:
from sklearn.metrics import accuracy_score,classification_report,confusion_matrix,f1_score,precision_score,recall_score
# Score the current predictions (yhat) against the test labels: overall
# accuracy, the per-class report, and the F1 score of the 'Yes' class.
accuracy = accuracy_score(y_test, yhat)
f1_yes = f1_score(y_test, yhat, pos_label='Yes')
print('Accuracy: ', accuracy)
print(classification_report(y_test, yhat))
print('F1 Score: ', f1_yes)

In [None]:
# Confusion matrix of the current predictions, drawn as an annotated heatmap
# with integer-formatted counts.
confusion = confusion_matrix(y_test, yhat)
sns.heatmap(confusion, annot=True, fmt='.0f')

In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV

In [None]:
# 5-nearest-neighbours classifier; yhat is overwritten with its test predictions.
knn = KNeighborsClassifier(n_neighbors=5).fit(X_train, y_train)
yhat = knn.predict(X_test)

In [None]:
# Score the current predictions (yhat) against the test labels: overall
# accuracy, the per-class report, and the F1 score of the 'Yes' class.
accuracy = accuracy_score(y_test, yhat)
f1_yes = f1_score(y_test, yhat, pos_label='Yes')
print('Accuracy: ', accuracy)
print(classification_report(y_test, yhat))
print('F1 Score: ', f1_yes)

In [None]:
# Confusion matrix of the current predictions, drawn as an annotated heatmap
# with integer-formatted counts.
confusion = confusion_matrix(y_test, yhat)
sns.heatmap(confusion, annot=True, fmt='.0f')

In [None]:
# Support-vector classifier with default settings; yhat is overwritten with its
# test predictions.
svm = SVC().fit(X_train, y_train)
yhat = svm.predict(X_test)

In [None]:
# Score the current predictions (yhat) against the test labels: overall
# accuracy, the per-class report, and the F1 score of the 'Yes' class.
accuracy = accuracy_score(y_test, yhat)
f1_yes = f1_score(y_test, yhat, pos_label='Yes')
print('Accuracy: ', accuracy)
print(classification_report(y_test, yhat))
print('F1 Score: ', f1_yes)

In [None]:
# Confusion matrix of the current predictions, drawn as an annotated heatmap
# with integer-formatted counts.
confusion = confusion_matrix(y_test, yhat)
sns.heatmap(confusion, annot=True, fmt='.0f')

In [None]:
# 250-tree random forest; yhat is overwritten with its test predictions.
# random_state is fixed (0, matching the extra-trees model and the train/test
# split) so that re-running the notebook reproduces the same forest and scores.
rf = RandomForestClassifier(n_estimators=250, random_state=0)
rf.fit(X_train, y_train)
yhat = rf.predict(X_test)

In [None]:
# Score the current predictions (yhat) against the test labels: overall
# accuracy, the per-class report, and the F1 score of the 'Yes' class.
accuracy = accuracy_score(y_test, yhat)
f1_yes = f1_score(y_test, yhat, pos_label='Yes')
print('Accuracy: ', accuracy)
print(classification_report(y_test, yhat))
print('F1 Score: ', f1_yes)

In [None]:
# Confusion matrix of the current predictions, drawn as an annotated heatmap
# with integer-formatted counts.
confusion = confusion_matrix(y_test, yhat)
sns.heatmap(confusion, annot=True, fmt='.0f')