In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# 1. Load dataset
df = pd.read_csv("/content/chennai_rainfall_2019_2023_mm.csv")
print(df.head())
# DataFrame.info() writes its summary to stdout itself and returns None, so
# wrapping it in print() would emit a stray "None" line after the summary.
df.info()

# 2. Handle missing values if any
df = df.dropna()

# 3. Features and target (example: predict Rainfall (mm))
y = df["Rainfall (mm)"]

# "Rainfall (cm)" is the target expressed in another unit (it equals mm / 10
# in every previewed row). Leaving it among the features leaks the answer and
# makes every R2 score below trivially perfect, so drop it with the target.
X = df.drop(columns=["Rainfall (mm)", "Rainfall (cm)"], errors="ignore")

# "Date" is loaded as a string column with a distinct value on each row, so
# one-hot encoding it would create roughly one indicator column per sample.
# Replace it with a numeric calendar feature instead; "Year" and "Month" are
# already separate columns in the dataset.
if "Date" in X.columns:
    dates = pd.to_datetime(X["Date"])
    X = X.drop(columns="Date").assign(DayOfYear=dates.dt.dayofyear)

# Convert any remaining categorical columns if needed
X = pd.get_dummies(X, drop_first=True)

# 4. Hold out 20% of the rows for testing (fixed seed for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Standardize features: the scaling statistics are learned from the training
# rows only and then applied to both partitions.
scaler = StandardScaler().fit(X_train)
X_train, X_test = scaler.transform(X_train), scaler.transform(X_test)

# 5. Baseline model: ordinary least squares with no regularization penalty
lr = LinearRegression().fit(X_train, y_train)
y_train_pred, y_test_pred = lr.predict(X_train), lr.predict(X_test)

# Report R2 on both partitions so a train/test gap is visible.
print("\nLinear Regression")
for label, actual, predicted in (
    ("Train R2:", y_train, y_train_pred),
    ("Test R2:", y_test, y_test_pred),
):
    print(label, r2_score(actual, predicted))

# 6. Ridge regression: least squares with an L2 penalty (alpha=1.0)
ridge = Ridge(alpha=1.0).fit(X_train, y_train)
y_test_pred_ridge = ridge.predict(X_test)
ridge_test_r2 = r2_score(y_test, y_test_pred_ridge)

print("\nRidge Regression")
print("Test R2:", ridge_test_r2)

# 7. Lasso regression: least squares with an L1 penalty (alpha=0.01)
lasso = Lasso(alpha=0.01).fit(X_train, y_train)
y_test_pred_lasso = lasso.predict(X_test)
lasso_test_r2 = r2_score(y_test, y_test_pred_lasso)

print("\nLasso Regression")
print("Test R2:", lasso_test_r2)

# 8. Cross-validation
# The hold-out Ridge model above was trained on standardized features, so the
# cross-validated model must be scaled the same way for the scores to be
# comparable (the alpha penalty depends on feature scale). Wrapping the scaler
# and the estimator in one pipeline re-fits the scaler on each fold's training
# part only, so no statistics from a validation fold reach the model.
cv_model = make_pipeline(StandardScaler(), Ridge(alpha=1.0))
cv_scores = cross_val_score(cv_model, X, y, cv=5, scoring='r2')
print("\nCross-validation mean R2:", np.mean(cv_scores))


         Date  Year  Month  Temperature (°C)  Humidity (%)  Wind Speed (km/h)  \
0  2019-01-01  2019      1              31.0          74.5               13.3   
1  2019-01-02  2019      1              29.7          87.1                9.5   
2  2019-01-03  2019      1              31.3          79.5               12.4   
3  2019-01-04  2019      1              33.0          80.1               13.1   
4  2019-01-05  2019      1              29.5          83.5               12.0   

   Rainfall (cm)  Rainfall (mm)  
0            0.5            5.0  
1            0.1            1.0  
2            0.3            3.0  
3            0.1            1.0  
4            1.4           14.0  
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1826 entries, 0 to 1825
Data columns (total 8 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Date               1826 non-null   object 
 1   Year               1826 non-null   int64  
 2   Month 