# Some preprocessing Using scikit-learn

### Import Required Libraries

In [2]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, mean_squared_error

### Load Dataset

In [3]:
data = pd.read_csv('flights.csv')

In [4]:
data.head()

Unnamed: 0,year,month,passengers
0,1949,January,112
1,1949,February,118
2,1949,March,132
3,1949,April,129
4,1949,May,121


In [5]:
data.shape

(144, 3)

In [6]:
data.describe(include='all')

Unnamed: 0,year,month,passengers
count,144.0,144,144.0
unique,,12,
top,,January,
freq,,12,
mean,1954.5,,280.298611
std,3.464102,,119.966317
min,1949.0,,104.0
25%,1951.75,,180.0
50%,1954.5,,265.5
75%,1957.25,,360.5


In [7]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 144 entries, 0 to 143
Data columns (total 3 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   year        144 non-null    int64 
 1   month       144 non-null    object
 2   passengers  144 non-null    int64 
dtypes: int64(2), object(1)
memory usage: 3.5+ KB


### Separate features and target variable

In [9]:
X = data.drop('passengers', axis=1)
y = data['passengers']

### Split Data into Training and Test Sets

In [11]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

### Identify numerical and categorical columns

In [14]:
numerical_features = X.select_dtypes(include=['int64', 'float64']).columns
categorical_features = X.select_dtypes(include=['object']).columns

### Define preprocessing for numerical features

In [17]:
numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])

### Define preprocessing for categorical features

In [24]:
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder(handle_unknown='ignore'))
])

### Combine preprocessing steps

In [30]:
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_features),
        ('cat', categorical_transformer, categorical_features)
    ])

### Define and Train Models

#### Example with Linear Regression

In [34]:
linear_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', LinearRegression())
])

In [35]:
linear_pipeline.fit(X_train, y_train)

#### Example with Logistic Regression

In [39]:
logistic_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', LogisticRegression())
])

In [40]:
logistic_pipeline.fit(X_train, y_train)

### Example with Random Forest

In [41]:
rf_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', RandomForestClassifier())
])

In [42]:
rf_pipeline.fit(X_train, y_train)

### Evaluate Models

#### For Regression Models

In [43]:
y_pred = linear_pipeline.predict(X_test)
y_pred

array([380.47425947, 212.39348502, 251.20676419, 321.79524282,
       255.51662731,  94.80025763, 417.75420028, 296.20955461,
       343.26016971, 214.07859265, 216.1092309 , 375.55556397,
       283.50215845, 418.68142561, 351.73475179, 254.84856899,
       151.51844237, 251.29268241, 457.68652593, 257.20445429,
        84.61625547, 375.27480385, 383.23386861, 425.39113167,
       309.27966781, 186.70189388,  89.81571109,  93.37159767,
       157.16610875])

In [46]:
print("Linear Regression MSE:", mean_squared_error(y_test, y_pred))

Linear Regression MSE: 7017.586206896552


#### For Classification Models

In [45]:
y_pred = logistic_pipeline.predict(X_test)
y_pred

array([461, 148, 180, 318, 404, 196, 461, 315, 199, 148, 348, 199, 180,
       318, 318, 229, 135, 347, 315, 318, 148, 405, 419, 315, 199, 133,
       148, 172, 132], dtype=int64)

In [48]:
print("Logistic Regression Accuracy:", accuracy_score(y_test, y_pred))

Logistic Regression Accuracy: 0.0


### Hyperparameter Tuning

#### Example with Random Forest and Grid Search

In [51]:
param_grid = {
    'model__n_estimators': [100, 200, 300],
    'model__max_depth': [None, 10, 20, 30]
}

In [53]:
grid_search = GridSearchCV(rf_pipeline,
                           param_grid, 
                           cv=2, 
                           scoring='accuracy'
                          )
grid_search.fit(X_train, y_train)



In [54]:
grid_search.best_params_

{'model__max_depth': None, 'model__n_estimators': 100}

In [55]:
 grid_search.best_score_

0.017392619479733817

#### Example with Logistic Regression and Random Search

In [59]:
param_dist = {
    'model__C': np.logspace(-4, 4, 20),
    'model__solver': ['liblinear', 'saga']
}

In [61]:
random_search = RandomizedSearchCV(logistic_pipeline, 
                                   param_dist, 
                                   n_iter=100, 
                                   cv=2, 
                                   scoring='accuracy', 
                                   random_state=42
                                  )
random_search.fit(X_train, y_train)



In [62]:
random_search.best_params_

{'model__solver': 'saga', 'model__C': 0.0001}

In [63]:
random_search.best_score_

0.03478523895946763