In [None]:
import numpy as np
import pandas as pd
import warnings
# Suppress every warning for the rest of the session (this also hides
# deprecation notices from the libraries used below).
warnings.simplefilter('ignore')

# Importing data

In [None]:
# Location of the raw CSV, relative to the notebook's working directory.
filepath = '../input/weather-dataset-rattle-package/weatherAUS.csv'

# Load the observations, using the 'Date' column as the row index.
df = pd.read_csv(filepath_or_buffer=filepath, index_col='Date')

# Preview the first rows of the loaded frame.
df.head()

In [None]:
# Print a summary of the frame: column names, non-null counts and dtypes.
df.info()

# Null values

In [None]:
# Count the missing entries of every column in a single pass.
null_counts = df.isnull().sum()

for name, count in null_counts.items():
    print(f"Null values in {name}: {count}")

# Columns with more than 30000 missing entries are removed from the frame.
null_cols = [name for name, count in null_counts.items() if count > 30000]

print(f"\n{null_cols}")
df.drop(columns=null_cols, inplace=True)

# Preprocessing

In [None]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

# Rows with a missing 'RainTomorrow' value are dropped rather than imputed:
# the column is used as the prediction target later in the notebook, and
# filling it with the most frequent class would fabricate labels.
df = df.dropna(subset=['RainTomorrow'])

# Split the columns by dtype family (not by exact dtype name) so that other
# numeric widths or string dtypes are not silently skipped.
numerical_cols = df.select_dtypes(include='number').columns.tolist()
categorical_cols = df.select_dtypes(exclude='number').columns.tolist()

# Fill the remaining gaps with each column's most frequent value.
# SimpleImputer returns a bare array, so the column labels AND the row index
# are passed back explicitly; otherwise the 'Date' index would be lost.
columns = df.columns
imputer = SimpleImputer(strategy='most_frequent')
df = pd.DataFrame(imputer.fit_transform(df), columns=columns, index=df.index)

# Encode every categorical column as integer codes. The encoder is re-fitted
# per column, so only the mapping of the last column stays on `encoder`.
encoder = LabelEncoder()
for col in categorical_cols:
    df[col] = encoder.fit_transform(df[col]).astype('int64')

# The imputer output is object-typed; restore real numeric dtypes.
for col in numerical_cols:
    df[col] = pd.to_numeric(df[col])

df.info()

# Models

**Mean Absolute Error**

In [None]:
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import cross_val_score

def get_mae(model, X_train, X_test, y_train, y_test):
    """Fit ``model`` on the training split and return its MAE on the test split.

    The model is fitted in place, so the object passed in is trained after
    the call. When the labels are encoded as 0/1, the mean absolute error of
    hard class predictions equals the fraction of misclassified samples.

    Parameters
    ----------
    model : estimator exposing ``fit(X, y)`` and ``predict(X)``.
    X_train, y_train : training features and targets passed to ``fit``.
    X_test, y_test : held-out features and targets used for scoring.

    Returns
    -------
    The value returned by ``mean_absolute_error(y_test, predictions)``.
    """
    model.fit(X_train, y_train)
    preds = model.predict(X_test)
    # scikit-learn metrics take the ground truth first, then the predictions.
    return mean_absolute_error(y_test, preds)

In [None]:
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

# Random forests of increasing size, all built with the same seed.
model_1, model_2, model_3 = (
    RandomForestClassifier(n_estimators=n_trees, random_state=0)
    for n_trees in (50, 100, 150)
)

# XGBoost classifiers sharing one learning rate, with a growing number of estimators.
model_4, model_5, model_6 = (
    XGBClassifier(n_estimators=n_rounds, learning_rate=0.05)
    for n_rounds in (100, 300, 600)
)

# Candidates in the order in which they are evaluated.
models = [model_1, model_2, model_3, model_4, model_5, model_6]

# Tests

In [None]:
from sklearn.model_selection import train_test_split

# Target column and the feature columns fed to every candidate model.
y = df['RainTomorrow']
features = ['Location', 'MinTemp', 'MaxTemp', 'Rainfall', 'WindGustDir', 
            'WindGustSpeed', 'WindDir9am', 'WindDir3pm', 'WindSpeed9am', 
            'WindSpeed3pm', 'Humidity9am', 'Humidity3pm', 'Pressure9am', 
            'Pressure3pm', 'Temp9am', 'Temp3pm', 'RainToday']
X = df[features]

# 80/20 split with a fixed seed: without it every run shuffles differently,
# so the scores below could not be reproduced or compared between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, train_size=0.8, random_state=0
)

# Fit each candidate on the training split and report its test-split MAE.
for i, model in enumerate(models, start=1):
    score = get_mae(model, X_train, X_test, y_train, y_test)
    print(f"Score for model {i}: {score}")

# Final model

In [None]:
from sklearn.metrics import classification_report

# Train the final classifier on the training split and score it on the test split.
model = XGBClassifier(n_estimators=600, learning_rate=0.05)

model.fit(X_train, y_train)
preds = model.predict(X_test)
print(classification_report(y_test, preds))

# Save the true labels next to the predictions. The first column name was
# previously misspelled as 'Rarin_data'.
output = pd.DataFrame({'Rain_data': y_test, 'Rain_preds': preds})
output.to_csv('output.csv', index=False)