In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from catboost import CatBoostClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
%matplotlib inline

In [None]:
df = pd.read_csv('../data/weatherAUS.csv')

In [None]:
df.head()

In [None]:
df.shape

In [None]:
df.dtypes

In [None]:
df.isnull().sum()

In [None]:
counts = df['RainTomorrow'].value_counts()
print(counts)

In [None]:
# Count of rows per RainToday value, split by RainTomorrow.
# `orient` is not passed: with only `x` given, seaborn draws vertical bars regardless.
sns.countplot(x='RainToday', hue='RainTomorrow', data=df)

In [None]:
total = df.isnull().sum().sort_values(ascending=False)
percent = (df.isnull().sum()/df.isnull().count()).sort_values(ascending=False)

In [None]:
missing_data = pd.concat([total, percent], axis=1, keys=['Total', 'Percent'])

In [None]:
f, ax = plt.subplots(figsize=(15, 6))
plt.xticks(rotation='90')
sns.barplot(x=missing_data.index, y=missing_data['Percent'])
missing_data

In [None]:
df=df.drop(['Sunshine','Evaporation','Cloud3pm','Cloud9am', 'Date','Location','RISK_MM'],axis=1)

In [None]:
df['RainTomorrow']=df['RainTomorrow'].map({'No':0,'Yes':1})

In [None]:
df['RainToday']=df['RainToday'].map({'No':0,'Yes':1})

In [None]:
df=df.dropna(how='any')

In [None]:
df["WindDir9am"].unique()

In [None]:
df["WindDir3pm"].unique()

In [None]:
df["WindGustDir"].unique()

In [None]:
df['WindDir9am']=df['WindDir9am'].astype('category')
df['WindDir9am']=df['WindDir9am'].cat.codes

In [None]:
df['WindDir3pm']=df['WindDir3pm'].map({'WNW':0, 'WSW':1, 'E':2, 'NW':3, 'W':4, 'SSE':5, 'ESE':6, 'ENE':7, 'NNW':8, 'SSW':9,
                                       'SW':10, 'SE':11, 'N':12, 'S':13, 'NNE':14,
                                       'NE':15})

In [None]:
df['WindGustDir']=df['WindGustDir'].map({'W':0, 'WNW':1, 'WSW':2, 'NE':3, 'NNW':4, 'N':5, 'NNE':6, 'SW':7, 'ENE':8, 'SSE':9,
                                         'S':10, 'NW':11, 'SE':12, 'ESE':13,
                                         'E':14, 'SSW':15})

In [None]:
df.head()

In [None]:
df.isnull().sum()

In [None]:
df.to_csv('../data/weather_preprocessed.csv', index = False)

Часть B

In [None]:
numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
tmp = df.select_dtypes(include=numerics)
tmp["RainTomorrow"]= df["RainTomorrow"]
# check columns:
tmp.columns

In [None]:
sns.pairplot(tmp, vars = tmp.columns[:4],hue="RainTomorrow")
plt.show()

In [None]:
sns.pairplot(tmp, vars = tmp.columns[4:8],hue="RainTomorrow")
plt.show()

In [None]:
sns.pairplot(tmp, vars = tmp.columns[8:12],hue="RainTomorrow")
plt.show()

Подготовка данных

In [None]:
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()

In [None]:
X = df.drop(labels = ['RainTomorrow'],axis = 1)
X.columns

In [None]:
y = df['RainTomorrow']

In [None]:
X = sc.fit_transform(X)

In [None]:
X.shape

In [None]:
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size = 0.2,random_state = 40)

## Обучение модели CatBoost

In [None]:
def print_classification_model_metrics(estimator, y_test, y_pred):
    """Print the confusion matrix, classification report and accuracy of predictions.

    All three metrics are computed from the same ``(y_test, y_pred)`` pair, so the
    function does not depend on any variable defined outside of it.

    Parameters
    ----------
    estimator : object
        The fitted model that produced ``y_pred``. Not used for the computation;
        kept so that existing calls keep working.
    y_test : array-like
        True labels.
    y_pred : array-like
        Predicted labels, in the same order as ``y_test``.
    """
    print(confusion_matrix(y_test, y_pred))
    print(classification_report(y_test, y_pred))
    # Accuracy from the given predictions (previously estimator.score on a global X_test).
    print(accuracy_score(y_test, y_pred))

In [None]:
# verbose=False silences the per-iteration training log that would otherwise flood the output.
model = CatBoostClassifier(learning_rate=0.2, verbose=False).fit(X_train, y_train)
print_classification_model_metrics(model, y_test, model.predict(X_test))


## KNN

In [None]:
from sklearn.neighbors import KNeighborsClassifier
# 3-nearest-neighbours classifier fitted on the training split.
model = KNeighborsClassifier(n_neighbors=3)
model.fit(X_train, y_train)
# Score on the training split and on the test split, printed one per line.
train_accuracy, test_accuracy = model.score(X_train, y_train), model.score(X_test, y_test)
print(train_accuracy, test_accuracy, sep="\n")
# Detailed metrics on the test-split predictions.
y_pred = model.predict(X_test)
print_classification_model_metrics(model, y_test, y_pred)

## SVM

In [None]:
from sklearn import svm
# Support vector classifier with a linear kernel, fitted on the training split.
model = svm.SVC(kernel='linear').fit(X_train, y_train)
# Score on the training split, then on the test split.
train_accuracy = model.score(X_train, y_train)
test_accuracy = model.score(X_test, y_test)
for accuracy in (train_accuracy, test_accuracy):
    print(accuracy)
# Detailed metrics on the test-split predictions.
y_pred = model.predict(X_test)
print_classification_model_metrics(model, y_test, y_pred)

## Decision Tree

In [None]:
from sklearn import tree
# Fixed random_state so that the fitted tree (and its metrics) is reproducible across runs;
# the value matches the seed used for the train/test split.
model = tree.DecisionTreeClassifier(random_state=40).fit(X_train, y_train)
y_pred = model.predict(X_test)
print_classification_model_metrics(model, y_test, y_pred)


