# 1. IMPORTING IMPORTANT LIBRARIES AND LOADING THE DATA

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score


In [None]:
# Load the dataset
df = pd.read_csv('/kaggle/input/industrial-residential-air-quality-classification/City_Types.csv')

# 2.DATA CLEANING AND PREPARATION

In [None]:
# Convert 'Date' column to datetime objects
df['Date'] = pd.to_datetime(df['Date'])

# Check for missing values
print("Missing values in each column:\n", df.isnull().sum())

# Check for duplicate rows
print("\nNumber of duplicate rows:", df.duplicated().sum())
# Remove duplicates if any
df.drop_duplicates(inplace=True)

# 3.DESCRIPTIVE STATISTICS

In [None]:
# Summary statistics for all pollutants
print("\nSummary statistics for all pollutants:")
print(df.describe())

# Summary statistics grouped by 'Type'
print("\nSummary statistics grouped by City Type:")
print(df.groupby('Type').describe())

# Summary statistics grouped by 'City'
print("\nSummary statistics grouped by City:")
print(df.groupby('City').describe())

# 4. EXPLORATORY DATA ANALYSIS

In [None]:
# Set the style for plots
sns.set_style("whitegrid")

# a. Univariate Analysis (Distribution of each pollutant)
pollutants = ['CO', 'NO2', 'SO2', 'O3', 'PM2.5', 'PM10']
plt.figure(figsize=(15, 10))
for i, pollutant in enumerate(pollutants, 1):
    plt.subplot(2, 3, i)
    sns.histplot(df[pollutant], kde=True)
    plt.title(f'Distribution of {pollutant}')
plt.tight_layout()
plt.show()

# b. Bivariate Analysis (Pollutant levels by City Type)
plt.figure(figsize=(15, 10))
for i, pollutant in enumerate(pollutants, 1):
    plt.subplot(2, 3, i)
    sns.boxplot(x='Type', y=pollutant, data=df)
    plt.title(f'{pollutant} Levels by City Type')
plt.tight_layout()
plt.show()

# c. Correlation Analysis
plt.figure(figsize=(10, 8))
correlation_matrix = df[pollutants].corr()
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt=".2f")
plt.title('Correlation Matrix of Pollutants')
plt.show()

# d. Time-Series Analysis (Corrected)

# 1. Define the numeric columns you want to analyze
pollutants = ['CO', 'NO2', 'SO2', 'O3', 'PM2.5', 'PM10']

# 2. Set 'Date' as the index and select ONLY the numeric pollutant columns
# This step is crucial to avoid the TypeError
df_numeric = df.set_index('Date')[pollutants]

# 3. Resample and get the mean from your numeric-only data
# This is the corrected line. Note that it uses 'df_numeric'.
df_daily = df_numeric.resample('D').mean()

# 4. Plot the results, which will now work without error
print("Plotting daily average pollutant levels...")
plt.figure(figsize=(15, 12))
for i, pollutant in enumerate(pollutants, 1):
    plt.subplot(3, 2, i)
    df_daily[pollutant].plot(ax=plt.gca())
    plt.title(f'Daily Average {pollutant} Levels Over Time')
    plt.xlabel('Date')
    plt.ylabel(pollutant)
plt.tight_layout()
plt.show()

# Summary of Performed Analysis
I've analyzed the dataset, starting with data cleaning and preparation, followed by a detailed EDA. Here's what I found:

1. Data Cleaning and Preparation:

* The Date column has been converted to a proper datetime format to enable time-series analysis.

* The dataset is clean, with no missing values or duplicate rows.

2. Key Insights from Exploratory Data Analysis:

**Here are the key takeaways from the visualizations:**

* The distributions of all pollutants are right-skewed, indicating that there are instances of very high pollution levels (outliers) compared to the average. This is common in environmental data.

**Pollutant Levels by City Type:**

* As expected, 'Industrial' cities show significantly higher median levels and a wider range of all pollutants, especially CO, SO2, PM2.5, and PM10, compared to 'Residential' cities.

* The difference is most pronounced for CO and SO2.

**Correlation Between Pollutants:**

* There is a strong positive correlation between PM2.5 and PM10, which makes sense as they are both particulate matter of different sizes.

* CO, NO2, and SO2 also show moderate to strong positive correlations with each other and with particulate matter, suggesting they may originate from similar sources (e.g., industrial activities, traffic).

* O3 (Ozone) shows a weak or even slightly negative correlation with some of the other pollutants. This is also expected, as ground-level ozone formation is a complex photochemical process that can be inversely related to high concentrations of other pollutants like NO2.

**Time-Series Trends:**

* The time-series plots reveal some fluctuations in pollutant levels over the year. A more in-depth analysis could uncover seasonal patterns. For instance, in many regions, particulate matter and CO levels tend to be higher in colder months.

# 4. BUILDING A MACHINE LEARNING MODEL  

In [None]:
# --- A. Feature Engineering ---
df['Month'] = df['Date'].dt.month
df['DayOfWeek'] = df['Date'].dt.dayofweek
df['Hour'] = df['Date'].dt.hour

# --- B. Data Preparation for Modeling ---

# Define features (X) and target (y)
features = ['CO', 'NO2', 'SO2', 'O3', 'PM2.5', 'PM10', 'Month', 'DayOfWeek', 'Hour']
target = 'Type'

X = df[features]
y = df[target]

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)

# Scale numerical features
# Note: We fit the scaler on the training data and transform both training and testing data
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# --- C. Model Training and Evaluation ---

# a. Logistic Regression
print("--- Logistic Regression Model ---")
log_reg = LogisticRegression(random_state=42)
log_reg.fit(X_train_scaled, y_train)
y_pred_log_reg = log_reg.predict(X_test_scaled)

# Evaluation
print("Accuracy:", accuracy_score(y_test, y_pred_log_reg))
print("\nClassification Report:\n", classification_report(y_test, y_pred_log_reg))

# Confusion Matrix
cm_log_reg = confusion_matrix(y_test, y_pred_log_reg)
plt.figure(figsize=(6, 4))
sns.heatmap(cm_log_reg, annot=True, fmt='d', cmap='Blues', xticklabels=['Industrial', 'Residential'], yticklabels=['Industrial', 'Residential'])
plt.title('Logistic Regression Confusion Matrix')
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.show()

# b. Random Forest Classifier
print("\n--- Random Forest Classifier Model ---")
rand_forest = RandomForestClassifier(random_state=42, n_estimators=100)
rand_forest.fit(X_train_scaled, y_train)
y_pred_rand_forest = rand_forest.predict(X_test_scaled)

# Evaluation
print("Accuracy:", accuracy_score(y_test, y_pred_rand_forest))
print("\nClassification Report:\n", classification_report(y_test, y_pred_rand_forest))

# Confusion Matrix
cm_rand_forest = confusion_matrix(y_test, y_pred_rand_forest)
plt.figure(figsize=(6, 4))
sns.heatmap(cm_rand_forest, annot=True, fmt='d', cmap='Greens', xticklabels=['Industrial', 'Residential'], yticklabels=['Industrial', 'Residential'])
plt.title('Random Forest Confusion Matrix')
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.show()

# c. Feature Importance from Random Forest
feature_importances = pd.DataFrame({'feature': features, 'importance': rand_forest.feature_importances_})
feature_importances = feature_importances.sort_values('importance', ascending=False)

plt.figure(figsize=(10, 6))
sns.barplot(x='importance', y='feature', data=feature_importances)
plt.title('Feature Importances from Random Forest')
plt.show()

# Machine Learning Model Summary
* I built two different classification models: Logistic Regression and a Random Forest Classifier. Both were trained to predict whether a city is 'Industrial' or 'Residential' using the air quality and time-based features.

1. Logistic Regression Model:

**Accuracy: 97.4%**

* This model performs very well, correctly classifying the city type over 97% of the time. The confusion matrix shows that it makes very few errors. For a simple and interpretable model, this is an excellent result.

2. Random Forest Classifier Model:

**Accuracy: 99.4%**

* The Random Forest model is even more accurate, achieving an impressive 99.4% accuracy. This is a very strong indicator that the air quality data is a powerful predictor of the city type.

* The confusion matrix for this model confirms its high performance, with even fewer misclassifications than the Logistic Regression model.

**What Drives the Predictions? Feature Importance**
* To understand which factors are most important for distinguishing between 'Industrial' and 'Residential' cities, I analyzed the feature importances from the Random Forest model.

* The most important features are CO (Carbon Monoxide), SO2 (Sulfur Dioxide), and PM2.5. This aligns perfectly with our earlier EDA, which showed that these pollutants were significantly higher in industrial areas.

* Other pollutants like NO2 and PM10 also contribute significantly.

* The time-based features (Hour, Month, DayOfWeek) have less importance, but they still play a role in the model's predictions.