In [1]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from unicodedata import category
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import ipywidgets
from ydata_profiling import ProfileReport
import seaborn as sns
import xgboost as xgb
from category_encoders import TargetEncoder
from sklearn import datasets, linear_model, metrics, model_selection, svm
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler, FunctionTransformer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import classification_report
from sklearn.model_selection import cross_val_score, cross_val_predict
from catboost import CatBoostClassifier
#from ydata_profiling.model.dataframe import preprocess

ModuleNotFoundError: No module named 'ydata_profiling'

## Data overview

### Data load

In [None]:
# Read the training and test splits from the working directory.
df, df_test = (pd.read_csv(csv_path) for csv_path in ('train.csv', 'test.csv'))
df.head()

### Target distribution

In [None]:
# Distribution of the 'rainfall' target column.
sns.displot(data=df['rainfall'])

### Features

#### NaN features

In [None]:
# Missing-value mask, transposed so each column of df becomes a heatmap row.
sns.heatmap(df.isna().T)

#### Numeric features

In [None]:
# Keep only the float64 columns for the numeric overview.
df_num_features = df.select_dtypes(include='float64')
df_num_features.head()

In [None]:
# One 20-bin histogram per float64 column.
df_num_features.hist(bins=20, figsize=(16, 16))

### Correlations

In [None]:
# Scatter of 'maxtemp' against 'temparature' (the column name is spelled this way in the data).
fig = sns.jointplot(data=df, x='maxtemp', y='temparature', kind='scatter')


In [None]:
# Density of 'cloud', with one filled curve per 'rainfall' value.
sns.kdeplot(
    data=df,
    x='cloud',
    hue='rainfall',
    fill=True,
)

### Outliers


In [None]:
# Box plot of 'pressure' to eyeball extreme values.
sns.boxplot(data=df, x='pressure')

In [None]:
# Pairwise correlations of df, rounded to two decimals and annotated on the heatmap.
plt.figure(figsize=(10, 5))
matrix = sns.heatmap(df.corr().round(2), annot=True, linewidths=0.5)


## Model Building

### Feature engineering

In [None]:
# Impute missing wind directions in the test split with that column's own mean.
# The result is written back into df_test, so the copies taken below inherit it.
mean_val = df_test['winddirection'].mean()
df_test['winddirection'] = df_test['winddirection'].fillna(mean_val)


def _build_linear_features(frame):
    """Return a copy of ``frame`` with the features used by the linear model.

    ``windspeed`` is replaced by its natural log, ``temp_rate`` is added as
    ``temparature - (maxtemp - mintemp) / 2``, and the three raw temperature
    columns are dropped. ``frame`` itself is left untouched.
    """
    out = frame.copy()
    out['windspeed'] = np.log(out['windspeed'])
    out['temp_rate'] = out['temparature'] - (out['maxtemp'] - out['mintemp']) / 2
    return out.drop(columns=['mintemp', 'maxtemp', 'temparature'])


# Both splits go through the same helper, so the test features are always
# derived from the test rows and never from the training frame.
df_linear = _build_linear_features(df)
df_linear_test = _build_linear_features(df_test)

# The float64 columns of the engineered training frame are the model inputs.
num_features = df_linear.select_dtypes(include=['float64']).columns

# Untransformed copies for the gradient-boosting model.
df_boost = df.copy()
df_boost_test = df_test.copy()
df_test.info()

#### Training sample creation

In [None]:
# Target and float64 feature matrices for the linear model.
y_train = df_linear['rainfall']
X_train = df_linear.loc[:, num_features]
X_test = df_linear_test.loc[:, num_features]


In [None]:
def dropper(X):
    """Return a new frame equal to ``X`` without its ``maxtemp`` column."""
    return X.drop('maxtemp', axis=1)

### Logistic Regression

In [None]:

# Standardise the numeric feature columns before the linear model.
preprocessor_logreg = ColumnTransformer(
    transformers=[
        ('scaler', StandardScaler(), num_features),
    ]
)

In [None]:
# Scaling followed by an L2-penalised logistic regression.
pipeline_logreg = Pipeline(
    steps=[
        ('preprocessor', preprocessor_logreg),
        ('classifier', LogisticRegression(penalty='l2', max_iter=1000)),
    ]
)

In [None]:
# Fit on the full training sample; this fitted pipeline is the one used for the submission.
pipeline_logreg.fit(X=X_train, y=y_train)

In [None]:
# Mean ROC AUC over 5 cross-validation folds.
print(
    cross_val_score(
        estimator=pipeline_logreg,
        X=X_train,
        y=y_train,
        scoring='roc_auc',
        cv=5,
    ).mean()
)

### CatBoost

In [None]:

# Rebinds X_train / y_train / X_test, which were previously built from
# df_linear for the logistic model.
# Positional slice: keeps every column except the first and the last.
# NOTE(review): presumably the first column is an id and the last is the
# 'rainfall' target — confirm against train.csv.
X_train = df_boost.iloc[:, 1:-1]

y_train = df_boost['rainfall']
# Test frame: only the first column is dropped (presumably the id — TODO confirm).
X_test = df_boost_test.iloc[:, 1:]
X_train.head()


In [None]:
# Single-step pipeline around CatBoost (depth=3, iterations=1000, verbose=0).
pipeline_boost = Pipeline([('classifier', CatBoostClassifier(depth=3, iterations=1000, verbose=0))])


In [None]:
# Fit the CatBoost pipeline on the full training sample.
pipeline_boost.fit(X=X_train, y=y_train)

In [None]:
# Mean ROC AUC over 5 cross-validation folds.
print(
    cross_val_score(
        estimator=pipeline_boost,
        X=X_train,
        y=y_train,
        scoring='roc_auc',
        cv=5,
    ).mean()
)

## Submission

In [None]:
# X_test was rebound for CatBoost above; restore the linear-model test features.
X_test = df_linear_test.loc[:, num_features]

# Load the sample submission and store the second predict_proba column as 'rainfall'.
df_submission = pd.read_csv('sample_submission.csv', index_col=False).assign(
    rainfall=pipeline_logreg.predict_proba(X_test)[:, 1]
)
df_submission.columns = ['id', 'rainfall']
df_submission.to_csv('submission.csv', index=False)