<a href="https://colab.research.google.com/github/samirarnj/multimodal/blob/main/Welcome_To_Colab.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, ConfusionMatrixDisplay
import os

# --- 1. Data Loading and Exploration ---

# Location of the dataset; replace with the actual path to your CSV file.
file_path = 'rainfall_data.csv'

# Load the dataset from disk. When the file is missing, fall back to an
# interactive upload if the Google Colab helper module is importable.
# Every failure path stops execution here, so the rest of the script never
# runs with `df` undefined.
try:
    df = pd.read_csv(file_path)
    print("File loaded successfully!")
except FileNotFoundError as missing_file:
    print(f"Error: {file_path} not found. Please provide the correct file path.")
    print("Current working directory:", os.getcwd())
    try:
        from google.colab import files
    except ImportError:
        print("Google Colab module not found. Please upload the file manually.")
        # No upload mechanism is available: surface the original error
        # instead of continuing and failing later with a NameError on `df`.
        raise missing_file from None
    try:
        uploaded = files.upload()  # Prompts for a file in the Colab UI
        if not uploaded:
            # A cancelled upload would otherwise show up as a bare
            # StopIteration with an empty message.
            raise ValueError("no file was uploaded")
        df = pd.read_csv(next(iter(uploaded)))  # Read the first uploaded file
        print("File loaded successfully from upload!")
    except Exception as e:
        print(f"An error occurred: {e}")
        # SystemExit(1) reports a failure status; the `exit()` helper is
        # injected by `site`/IPython and would have reported success.
        raise SystemExit(1) from e

# Initial Data Overview
print("\nInitial Data Overview:")
print(df.head())
print("\nData Info:")
# info() writes its report directly and returns None, so it must not be
# wrapped in print() (that would emit a stray "None" line).
df.info()
print("\nDescriptive Statistics:")
print(df.describe())

# Check for missing values
print("\nMissing Values:")
print(df.isnull().sum())

# Explore target variable distribution
print("\nTarget Variable Distribution:")
print(df['RainTomorrow'].value_counts())
sns.countplot(x='RainTomorrow', data=df)
plt.title('Rain Tomorrow Distribution')
plt.show()

# Explore correlations.
# Only numeric columns can be correlated; the frame still contains text
# columns at this point (e.g. the 'Yes'/'No' target), and recent pandas
# versions raise on them instead of silently skipping them.
numeric_df = df.select_dtypes(include=np.number)
plt.figure(figsize=(12, 10))
sns.heatmap(numeric_df.corr(), annot=True, cmap='coolwarm')
plt.title('Correlation Matrix')
plt.show()

# --- 2. Data Preprocessing ---

# When a 'Date' column exists, parse it, keep its month and year as new
# columns, and discard the raw date itself.
if 'Date' in df.columns:
    parsed_dates = pd.to_datetime(df['Date'])
    df['Month'] = parsed_dates.dt.month
    df['Year'] = parsed_dates.dt.year
    df = df.drop(columns='Date')

# Rows without a target label cannot be used for training or evaluation.
df = df.dropna(subset=['RainTomorrow'])

# Binary-encode the target and keep every other column as a feature.
target_encoding = {'No': 0, 'Yes': 1}
y = df['RainTomorrow'].map(target_encoding)
X = df.drop(columns='RainTomorrow')

# Group the feature columns by dtype.
categorical_features = X.select_dtypes(include=['object']).columns.to_list()
numerical_features = X.select_dtypes(include=np.number).columns.to_list()

# Numeric columns: fill gaps with the median, then standardize.
numerical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

# Categorical columns: fill gaps with a 'missing' label, then one-hot encode;
# categories unseen during fit are ignored at transform time.
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

# Route each column group through its own transformer.
preprocessor = ColumnTransformer([
    ('num', numerical_transformer, numerical_features),
    ('cat', categorical_transformer, categorical_features),
])

# Hold out 20% of the rows for testing, stratified on the target.
split_options = dict(test_size=0.2, stratify=y, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

# --- 3. Model Training and Evaluation ---

# Logistic Regression: shared preprocessing followed by the classifier.
pipeline_lr = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', LogisticRegression(random_state=42)),
])

# Hyperparameter candidates; the 'classifier__' prefix targets the
# pipeline step of that name.
param_grid_lr = dict(
    classifier__solver=['liblinear'],
    classifier__penalty=['l1', 'l2'],
    classifier__C=[0.1, 1, 10],
    classifier__class_weight=[None, 'balanced'],
)

# 5-fold cross-validated grid search, scored on accuracy.
grid_search_lr = GridSearchCV(
    estimator=pipeline_lr,
    param_grid=param_grid_lr,
    scoring='accuracy',
    cv=5,
    verbose=2,
)
grid_search_lr.fit(X_train, y_train)

# Score the search result on the held-out test set.
y_pred_lr = grid_search_lr.predict(X_test)
accuracy_lr = accuracy_score(y_test, y_pred_lr)
report_lr = classification_report(y_test, y_pred_lr)
print("\nLogistic Regression Results:")
print("Best Parameters:", grid_search_lr.best_params_)
print("Accuracy:", accuracy_lr)
print(report_lr)

# Plot the confusion matrix of the test-set predictions.
cm_lr = confusion_matrix(y_test, y_pred_lr)
disp_lr = ConfusionMatrixDisplay(cm_lr)
disp_lr.plot(cmap='Blues')
plt.title('Logistic Regression Confusion Matrix')
plt.show()

# Random Forest: shared preprocessing followed by the classifier.
pipeline_rf = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(random_state=42)),
])

# Hyperparameter candidates; the 'classifier__' prefix targets the
# pipeline step of that name.
param_grid_rf = dict(
    classifier__n_estimators=[50, 100, 200],
    classifier__max_depth=[None, 10, 20],
    classifier__min_samples_split=[2, 5, 10],
)

# 5-fold cross-validated grid search, scored on accuracy.
grid_search_rf = GridSearchCV(
    estimator=pipeline_rf,
    param_grid=param_grid_rf,
    scoring='accuracy',
    cv=5,
    verbose=2,
)
grid_search_rf.fit(X_train, y_train)

# Score the search result on the held-out test set.
y_pred_rf = grid_search_rf.predict(X_test)
accuracy_rf = accuracy_score(y_test, y_pred_rf)
report_rf = classification_report(y_test, y_pred_rf)
print("\nRandom Forest Results:")
print("Best Parameters:", grid_search_rf.best_params_)
print("Accuracy:", accuracy_rf)
print(report_rf)

# Plot the confusion matrix of the test-set predictions.
cm_rf = confusion_matrix(y_test, y_pred_rf)
disp_rf = ConfusionMatrixDisplay(cm_rf)
disp_rf.plot(cmap='Blues')
plt.title('Random Forest Confusion Matrix')
plt.show()

# --- 4. Feature Importance (Random Forest) ---

# Both fitted steps come from the pipeline that the grid search refit with
# its best parameters.
best_rf_pipeline = grid_search_rf.best_estimator_
fitted_preprocessor = best_rf_pipeline.named_steps['preprocessor']
feature_importances = best_rf_pipeline.named_steps['classifier'].feature_importances_

# Ask the fitted ColumnTransformer for its actual output columns instead of
# rebuilding the list by hand from the input column lists: that stays aligned
# with `feature_importances` whatever the transformers emitted, and it also
# works when there are no categorical features at all.
# Names come back as '<transformer>__<feature>' (e.g. 'num__...', 'cat__...');
# the prefix is stripped so the plot shows plain feature labels.
feature_names = [
    name.split('__', 1)[-1]
    for name in fitted_preprocessor.get_feature_names_out()
]

# Rank features by importance and keep the 20 strongest.
importance_df = (
    pd.DataFrame({'Feature': feature_names, 'Importance': feature_importances})
    .sort_values(by='Importance', ascending=False)
    .head(20)
)

plt.figure(figsize=(10, 6))
plt.barh(importance_df['Feature'], importance_df['Importance'], color='skyblue')
plt.gca().invert_yaxis()  # Put the most important feature at the top
plt.title('Top 20 Feature Importances (Random Forest)')
plt.xlabel('Importance Score')
plt.show()


Error: rainfall_data.csv not found. Please provide the correct file path.
Current working directory: /content
