Импортируем библиотеки

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

Загружаем данные

In [None]:
# Read the weather CSV from the mounted input directory into the notebook-wide frame.
df = pd.read_csv(
    filepath_or_buffer='/kaggle/input/did-it-rain-in-seattle-19482017/seattleWeather_1948-2017.csv',
)

In [None]:
# Print (rows, columns), then display the first five records.
print((len(df.index), len(df.columns)))
df.head(n=5)

In [None]:
# Count how often each distinct value of the RAIN column occurs.
df.loc[:, 'RAIN'].value_counts()

# Подготовка данных для обучения
Проверяем наличие пропущенных значений (NaN) и удаляем строки с ними

In [None]:
# Print each column name followed by the number of missing entries it holds.
missing_per_column = df.isnull().sum()
for column_name, missing_count in missing_per_column.items():
    print(column_name)
    print(missing_count)

In [None]:
# Remove, in place, every row that has at least one missing value.
df.dropna(axis=0, how='any', inplace=True)

Корреляция — это статистическая взаимосвязь случайных величин. Коэффициент корреляции показывает лишь степень линейной связи между признаками и не доказывает причинно-следственной зависимости.

In [None]:
# Pairwise correlation of the numeric/boolean columns only. Selecting them
# explicitly keeps this working on pandas versions where DataFrame.corr no
# longer silently skips non-numeric columns such as DATE.
corr = df.select_dtypes(include=['number', 'bool']).corr()
# Draw the matrix computed above instead of recomputing df.corr() a second time.
sns.heatmap(
    data=corr,
    annot=True,
    cmap="coolwarm",
    linewidths=1,
    fmt=".2f",
    linecolor="gray",
)

Попарные графики признаков с раскраской по целевой переменной RAIN

In [None]:
# Grid of pairwise plots over the frame's columns, coloured by the RAIN value.
sns.pairplot(
    data=df,
    hue='RAIN',
    palette="husl",
)

In [None]:
# Bivariate kernel-density plot of TMIN against TMAX, one colour per RAIN value.
sns.displot(
    data=df,
    x='TMIN',
    y='TMAX',
    hue='RAIN',
    kind='kde',
    palette="husl",
)

Преобразуем логическую колонку RAIN в значения 0 или 1

In [None]:
# One-hot encode RAIN and keep a single indicator column (drop_first removes
# the column of the first category). dtype=int forces 0/1 integers; without it
# recent pandas versions return booleans instead of the intended 0/1 values.
rain = pd.get_dummies(df['RAIN'], drop_first=True, dtype=int)

In [None]:
# Replace the original RAIN column with the encoded indicator column, which
# is appended on the right; then show the new dimensions and a preview.
df = pd.concat([df.drop(columns='RAIN'), rain], axis=1)
print(df.shape)
df.head(n=5)

In [None]:
# Reassign the whole header so that the appended indicator column is named RAIN.
column_names = ['DATE', 'PRCP', 'TMAX', 'TMIN', 'RAIN']
df.columns = column_names
df.head(n=5)

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn import metrics

In [None]:
# x and y were never assigned before this cell, so this line and the
# train/test split after it failed with NameError. Build them here: the three
# numeric measurements are the features (consistent with input_dim=3 of the
# dense network later in the notebook) and RAIN is the target.
# NOTE(review): PRCP presumably determines RAIN directly — confirm that using
# it as a feature is intended.
feature_columns = ['PRCP', 'TMAX', 'TMIN']
x = df[feature_columns]
y = df['RAIN']
x.shape, y.shape

In [None]:
# Hold out 25% of the rows for testing; the fixed seed makes the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.25,
    random_state=42,
)

# 1. Логистическая регрессия

In [None]:
from sklearn.linear_model import LogisticRegression

# Fit a logistic regression with default settings on the training split
# (fit returns the estimator itself) and report metrics on the test split.
logistic = LogisticRegression().fit(x_train, y_train)
prediction_lr = logistic.predict(x_test)
report_lr = classification_report(y_test, prediction_lr)
print(report_lr)

In [None]:
# ROC curves of the logistic regression on the train and test splits, on one figure.
plt.figure(figsize=(10, 10))
for split_name, features, labels in (("train", x_train, y_train), ("test", x_test, y_test)):
    # Column 1 holds the probability of the second class in logistic.classes_.
    y_pred_proba = logistic.predict_proba(features)[:, 1]
    fpr, tpr, _ = metrics.roc_curve(labels, y_pred_proba)
    auc = metrics.roc_auc_score(labels, y_pred_proba)
    # The legend used to read "SVM" although the curve belongs to this model.
    plt.plot(fpr, tpr, label="Logistic regression " + split_name + ", auc=" + str(auc))
plt.legend(loc=4)
plt.show()

# 2. Случайный лес

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Fit a random forest with default settings on the training split
# (fit returns the estimator itself) and report metrics on the test split.
rforest = RandomForestClassifier().fit(x_train, y_train)
prediction_rf = rforest.predict(x_test)
report_rf = classification_report(y_test, prediction_rf)
print(report_rf)

In [None]:
# ROC curves of the random forest on the train and test splits, on one figure.
plt.figure(figsize=(10, 10))
for split_name, features, labels in (("train", x_train, y_train), ("test", x_test, y_test)):
    # The fitted estimator is named rforest; the previous code referenced an
    # undefined name `forest` and raised NameError.
    # Column 1 holds the probability of the second class in rforest.classes_.
    y_pred_proba = rforest.predict_proba(features)[:, 1]
    fpr, tpr, _ = metrics.roc_curve(labels, y_pred_proba)
    auc = metrics.roc_auc_score(labels, y_pred_proba)
    # The legend used to read "SVM" although the curve belongs to this model.
    plt.plot(fpr, tpr, label="Random forest " + split_name + ", auc=" + str(auc))
plt.legend(loc=4)
plt.show()

# 3. KNN

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# Fit a 4-nearest-neighbours classifier and print its accuracy on the test split.
knn = KNeighborsClassifier(n_neighbors=4)
knn.fit(x_train, y_train)
y_pred4 = knn.predict(x_test)
print(metrics.accuracy_score(y_test, y_pred4))

# ROC curves of the KNN classifier on the train and test splits, on one figure.
plt.figure(figsize=(10, 10))
for split_name, features, labels in (("train", x_train, y_train), ("test", x_test, y_test)):
    # Column 1 holds the probability of the second class in knn.classes_.
    y_pred_proba = knn.predict_proba(features)[:, 1]
    fpr, tpr, _ = metrics.roc_curve(labels, y_pred_proba)
    auc = metrics.roc_auc_score(labels, y_pred_proba)
    # The legend used to read "SVM" although the curve belongs to this model.
    plt.plot(fpr, tpr, label="KNN " + split_name + ", auc=" + str(auc))
plt.legend(loc=4)
plt.show()

In [None]:
# Remove, in place, every row that has at least one missing value.
df.dropna(axis=0, how='any', inplace=True)

In [None]:
# NOTE(review): ds2 is not referenced anywhere else in the visible notebook —
# presumably a leftover; confirm before removing.
ds2 = "target_class"

In [None]:
# Print the dimensions of the four split objects, features first, then labels.
for split in (x_train, x_test, y_train, y_test):
    print(split.shape)

In [None]:
# Mean of the held-out labels; if the labels are 0/1 this is the share of
# positive rows in the test split.
y_test.mean()

In [None]:
# Display the first five rows of the training features.
x_train.head(n=5)

In [None]:
# Display the first five held-out labels.
y_test.head(n=5)

# 4. RNN

In [None]:
from keras.models import Sequential
from keras.layers import Flatten, Dense
from keras.layers import Embedding, SimpleRNN
from keras.layers import Reshape

# The previous first layer was Embedding(2000, 8, input_length=20), which
# expects length-20 sequences of integer token ids. The data here are a few
# continuous measurements per row, so the values were misused as vocabulary
# indices and the declared length did not match the input width.
# Instead, treat each row as a short sequence with one value per time step:
# Reshape turns (batch, n_features) into (batch, n_features, 1), so the model
# still accepts the same 2-D input that the evaluation cells pass to it.
n_features = x_train.shape[1]

model = Sequential()
model.add(Reshape((n_features, 1), input_shape=(n_features,)))
model.add(SimpleRNN(32))
# A single sigmoid unit outputs one probability per row.
model.add(Dense(1, activation='sigmoid'))
model.compile(optimizer='rmsprop', loss='binary_crossentropy', metrics=['acc'])
model.summary()
# The last 20% of the training rows are held back by Keras for validation.
history = model.fit(x_train, y_train,
                    epochs=30,
                    batch_size=16,
                    validation_split=0.2)

In [None]:
# ROC curves of the recurrent network on the train and test splits, on one figure.
plt.figure(figsize=(10, 10))
for split_name, features, labels in (("train", x_train, y_train), ("test", x_test, y_test)):
    # Sequential.predict_proba no longer exists in current Keras; for a single
    # sigmoid output, predict already returns the probability. ravel flattens
    # the (n, 1) output to the 1-D score vector the metric functions expect.
    y_pred_proba = model.predict(features).ravel()
    fpr, tpr, _ = metrics.roc_curve(labels, y_pred_proba)
    auc = metrics.roc_auc_score(labels, y_pred_proba)
    # The legend used to read "SVM" although the curve belongs to this model.
    plt.plot(fpr, tpr, label="RNN " + split_name + ", auc=" + str(auc))
plt.legend(loc=4)
plt.show()

In [None]:
# List the names of the metrics recorded during the most recent fit call.
recorded_metrics = history.history.keys()
print(recorded_metrics)

In [None]:
# Evaluate on the held-out split; the first two entries of the result are
# printed as loss and accuracy.
result = model.evaluate(x_test, y_test)
test_loss, test_accuracy = result[0], result[1]
print("test loss:{}\ntest accuracy:{}".format(test_loss, test_accuracy))

# 5. ANN

In [None]:
import keras
from keras.layers import Dense
from keras.models import Sequential
from keras.layers import Dropout

In [None]:
# Small fully connected binary classifier: 3 inputs -> 32 -> 16 -> 1 sigmoid
# unit, with dropout after the first hidden layer.
modelAnn = Sequential([
    Dense(32, activation='relu', input_dim=3),
    Dropout(0.5),
    Dense(16, activation='relu'),
    Dense(1, activation='sigmoid')
])
# Binary cross-entropy is the loss that matches a sigmoid output trained on
# 0/1 labels, and is what the recurrent model in this notebook uses; the
# previous mean_squared_error setting treated the task as regression.
modelAnn.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
modelAnn.summary()

In [None]:
# Train the dense network; verbose=2 prints one summary line per epoch.
history = modelAnn.fit(
    x_train,
    y_train,
    batch_size=10,
    epochs=10,
    verbose=2,
)

In [None]:
# List the names of the metrics recorded during the most recent fit call.
recorded_metrics = history.history.keys()
print(recorded_metrics)

In [None]:
# The network ends in a single sigmoid unit, so predict yields one probability
# per row; flatten it and threshold at 0.5 in one vectorised step. This also
# avoids a loop variable that reused the name of the label vector `y`.
positive_probability = modelAnn.predict(x_test).ravel()
prediction_nn = (positive_probability >= 0.5).astype(int)
print(classification_report(y_test, prediction_nn))

In [None]:
# ROC curves of the dense network on the train and test splits, on one figure.
plt.figure(figsize=(10, 10))
for split_name, features, labels in (("train", x_train, y_train), ("test", x_test, y_test)):
    # Sequential.predict_proba no longer exists in current Keras; for a single
    # sigmoid output, predict already returns the probability. ravel flattens
    # the (n, 1) output to the 1-D score vector the metric functions expect.
    y_pred_proba = modelAnn.predict(features).ravel()
    fpr, tpr, _ = metrics.roc_curve(labels, y_pred_proba)
    auc = metrics.roc_auc_score(labels, y_pred_proba)
    # The legend used to read "SVM" although the curve belongs to this model.
    plt.plot(fpr, tpr, label="ANN " + split_name + ", auc=" + str(auc))
plt.legend(loc=4)
plt.show()