# Dimensionality Reduction Techniques

In [60]:
import numpy as np
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_squared_error

In [61]:
data = pd.read_csv('flights.csv')

In [62]:
data

Unnamed: 0,year,month,passengers
0,1949,January,112
1,1949,February,118
2,1949,March,132
3,1949,April,129
4,1949,May,121
...,...,...,...
139,1960,August,606
140,1960,September,508
141,1960,October,461
142,1960,November,390


### Separate features and target variable

In [63]:
X = data.drop('passengers', axis=1)
y = data['passengers']

### Split the dataset into training and test sets

In [64]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

### Identify numerical and categorical columns

In [65]:
numerical_features = X.select_dtypes(include=['int64', 'float64']).columns
categorical_features = X.select_dtypes(include=['object']).columns

### Define preprocessing for numerical features

In [66]:
numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])

### Define preprocessing for categorical features

In [67]:
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder(handle_unknown='ignore', sparse_output=False)) 
])

In [68]:
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_features),
        ('cat', categorical_transformer, categorical_features)
    ])

### Principal Component Analysis (PCA)

In [69]:
pca_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('pca', PCA(n_components=2)),  
    ('model', LinearRegression())
])

In [70]:
pca_pipeline.fit(X_train, y_train)

In [71]:
y_pred_pca = pca_pipeline.predict(X_test)
pca_mse = mean_squared_error(y_test, y_pred_pca)

In [72]:
y_pred_pca

array([397.48194491, 132.21928231, 294.28548165, 359.8917792 ,
       256.9135928 , 130.08641868, 459.09393424, 261.49559955,
       265.16849975, 133.56549353, 230.61684217, 298.06925131,
       327.18623321, 458.59403386, 363.5646794 , 264.86242474,
       164.81533906, 265.87893869, 425.99935732, 294.09027608,
        98.09975154, 394.20651554, 393.59712593, 393.09860577,
       230.92153698, 200.07743557, 101.37518091, 100.35866696,
       163.29186504])

In [73]:
pca_mse

1524.8014484028436

### Linear Discriminant Analysis (LDA):

#### Convert target variable to categorical

##### LDA is generally used for classification problems, not regression. Since our target is continuous, So we did this step

In [88]:
y_train_class = pd.qcut(y_train, q=2, labels=False)
y_test_class = pd.qcut(y_test, q=2, labels=False)

### LDA pipeline

In [89]:
lda_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('lda', LDA(n_components=1)),  # Reduce to 1 component
    ('model', LinearRegression())
])

In [90]:
lda_pipeline.fit(X_train, y_train_class)

In [91]:
y_pred_lda = lda_pipeline.predict(X_test)
lda_mse = mean_squared_error(y_test_class, y_pred_lda)

In [92]:
y_pred_lda

array([ 0.98163844,  0.06735443,  0.48683371,  0.7606103 ,  0.34246666,
       -0.14581187,  1.09466965,  0.48579383,  0.4315074 , -0.0646852 ,
        0.2519037 ,  0.55555556,  0.61088186,  1.13275475,  0.81743499,
        0.44529053,  0.00380739,  0.48544583,  1.10603459,  0.512514  ,
       -0.20707446,  0.9093589 ,  0.83416853,  0.98198643,  0.43949888,
        0.23734953, -0.13479493, -0.17495022, -0.03416853])

In [93]:
lda_mse

0.09341569535404194

### SelectKBest:

In [95]:
selectkbest_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('selectkbest', SelectKBest(score_func=f_regression, k=2)),
    ('model', LinearRegression())
])

In [96]:
selectkbest_pipeline.fit(X_train, y_train)

In [97]:
y_pred_selectkbest = selectkbest_pipeline.predict(X_test)
selectkbest_mse = mean_squared_error(y_test, y_pred_selectkbest)

In [98]:
y_pred_selectkbest

array([388.93705846, 210.84850738, 291.1420628 , 356.33872658,
       225.94539903, 128.15040337, 454.13372224, 258.54373092,
       258.54373092, 128.15040337, 225.94539903, 291.1420628 ,
       323.74039469, 454.13372224, 356.33872658, 258.54373092,
       160.74873526, 258.54373092, 421.53539035, 291.1420628 ,
        95.55207148, 388.93705846, 388.93705846, 388.93705846,
       308.64350304, 193.34706714,  95.55207148,  95.55207148,
       160.74873526])

In [99]:
selectkbest_mse

1460.730979878187

### Evaluate Model Performance

In [100]:
print("PCA MSE:", pca_mse)

PCA MSE: 1524.8014484028436


In [101]:
print("SelectKBest MSE:", selectkbest_mse)

SelectKBest MSE: 1460.730979878187


In [102]:
print("LDA MSE:", lda_mse)

LDA MSE: 0.09341569535404194
