In [8]:
import pandas as pd

In [9]:
df = pd.read_csv("/content/chennai_rainfall_2019_2023_mm.csv")

In [10]:
df

Unnamed: 0,Date,Year,Month,Temperature (°C),Humidity (%),Wind Speed (km/h),Rainfall (cm),Rainfall (mm)
0,2019-01-01,2019,1,31.0,74.5,13.3,0.5,5.0
1,2019-01-02,2019,1,29.7,87.1,9.5,0.1,1.0
2,2019-01-03,2019,1,31.3,79.5,12.4,0.3,3.0
3,2019-01-04,2019,1,33.0,80.1,13.1,0.1,1.0
4,2019-01-05,2019,1,29.5,83.5,12.0,1.4,14.0
...,...,...,...,...,...,...,...,...
1821,2023-12-27,2023,12,30.2,81.1,10.5,0.1,1.0
1822,2023-12-28,2023,12,29.6,83.9,11.5,0.4,4.0
1823,2023-12-29,2023,12,28.7,77.8,16.9,0.1,1.0
1824,2023-12-30,2023,12,30.3,82.0,11.5,0.0,0.0


In [11]:
df.isnull().sum()

Unnamed: 0,0
Date,0
Year,0
Month,0
Temperature (°C),0
Humidity (%),0
Wind Speed (km/h),0
Rainfall (cm),0
Rainfall (mm),0


In [12]:
df = df.drop(['Date', 'Rainfall (cm)', 'Year', 'Month'], axis=1)
print(df.columns)

Index(['Temperature (°C)', 'Humidity (%)', 'Wind Speed (km/h)',
       'Rainfall (mm)'],
      dtype='object')


In [13]:
numeric_columns = df.select_dtypes(include=['number','float'])
numeric_columns

Unnamed: 0,Temperature (°C),Humidity (%),Wind Speed (km/h),Rainfall (mm)
0,31.0,74.5,13.3,5.0
1,29.7,87.1,9.5,1.0
2,31.3,79.5,12.4,3.0
3,33.0,80.1,13.1,1.0
4,29.5,83.5,12.0,14.0
...,...,...,...,...
1821,30.2,81.1,10.5,1.0
1822,29.6,83.9,11.5,4.0
1823,28.7,77.8,16.9,1.0
1824,30.3,82.0,11.5,0.0


In [14]:
outlier_summary = {}

for col in numeric_columns:
    Q1 = df[col].quantile(0.25)
    Q3 = df[col].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR

    outliers = df[(df[col] < lower_bound) | (df[col] > upper_bound)]

    if outliers.shape[0] > 0:
        outlier_summary[col] = "Yes"
    else:
        outlier_summary[col] = "No"

# Show summary
for col, status in outlier_summary.items():
    print(f"{col}: Outliers present? {status}")


Temperature (°C): Outliers present? Yes
Humidity (%): Outliers present? Yes
Wind Speed (km/h): Outliers present? Yes
Rainfall (mm): Outliers present? Yes


In [15]:
df['Temperature (°C)'] = df['Temperature (°C)'].clip(lower_bound, upper_bound)
df['Humidity (%)'] = df['Humidity (%)'].clip(lower_bound, upper_bound)
df['Wind Speed (km/h)'] = df['Wind Speed (km/h)'].clip(lower_bound, upper_bound)
df['Rainfall (cm)'] = df['Rainfall (mm)'].clip(lower_bound, upper_bound)

In [16]:
X = df.drop('Rainfall (mm)' ,axis=1)
y = df['Rainfall (mm)']

In [17]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [18]:
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score

# Initialize the scaler
scaler = StandardScaler()

# Fit scaler on training data and transform
X_train_scaled = scaler.fit_transform(X_train)

# Transform test data
X_test_scaled = scaler.transform(X_test)

# Create and train the Linear Regression model
lin_reg = LinearRegression()
lin_reg.fit(X_train_scaled, y_train)

# Predict on test data
y_pred = lin_reg.predict(X_test_scaled)

# Evaluate
score = r2_score(y_test, y_pred)
print("R² Score:", score)


R² Score: 0.8853479959643954


In [19]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score

# Create the model
rf = RandomForestRegressor(n_estimators=100, random_state=42)

# Train the model
rf.fit(X_train, y_train)

# Predict
y_pred = rf.predict(X_test)

# Evaluate with R² score
score = r2_score(y_test, y_pred)
print("R² Score (Random Forest):", score)


R² Score (Random Forest): 0.9194650048871367


In [20]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import r2_score

# Create the model
dt = DecisionTreeRegressor(random_state=42)

# Train the model
dt.fit(X_train, y_train)

# Predict
y_pred = dt.predict(X_test)

# Evaluate
score = r2_score(y_test, y_pred)
print("R² Score (Decision Tree):", score)


R² Score (Decision Tree): 0.8812227122786528


In [21]:
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import r2_score

# Create the model (you can tune n_neighbors)
knn = KNeighborsRegressor(n_neighbors=5)

# Train the model
knn.fit(X_train, y_train)

# Predict
y_pred = knn.predict(X_test)

# Evaluate with R² score
score = r2_score(y_test, y_pred)
print("R² Score (KNN):", score)


R² Score (KNN): 0.915845491072637
