# 🔴 Task 30-> Some preprocessing Using scikit-learn

### Objective:- Preprocessing is a crucial step in preparing your data for machine learning models.

#### Import the required Libraries

In [46]:
import numpy as np
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

#### Import the csv file as a Data Frame

In [47]:
titanic_df = pd.read_csv('titanic.csv')

#### Now, Preprocess first the columns with numerical data

In [48]:
titanic_df['Survived'] = titanic_df['Survived'].map({0: 0, 1: 1})
num_cols = titanic_df.select_dtypes(include=['int64', 'float64']).columns
imputer = SimpleImputer(strategy='mean')
titanic_df[num_cols] = imputer.fit_transform(titanic_df[num_cols])
scaler = StandardScaler()
titanic_df[num_cols] = scaler.fit_transform(titanic_df[num_cols])
display(titanic_df)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,-1.730108,-0.789272,0.827377,"Braund, Mr. Owen Harris",male,-0.592481,0.432793,-0.473674,A/5 21171,-0.502445,,S
1,-1.726220,1.266990,-1.566107,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,0.638789,0.432793,-0.473674,PC 17599,0.786845,C85,C
2,-1.722332,1.266990,0.827377,"Heikkinen, Miss. Laina",female,-0.284663,-0.474545,-0.473674,STON/O2. 3101282,-0.488854,,S
3,-1.718444,1.266990,-1.566107,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,0.407926,0.432793,-0.473674,113803,0.420730,C123,S
4,-1.714556,-0.789272,0.827377,"Allen, Mr. William Henry",male,0.407926,-0.474545,-0.473674,373450,-0.486337,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
886,1.714556,-0.789272,-0.369365,"Montvila, Rev. Juozas",male,-0.207709,-0.474545,-0.473674,211536,-0.386671,,S
887,1.718444,1.266990,-1.566107,"Graham, Miss. Margaret Edith",female,-0.823344,-0.474545,-0.473674,112053,-0.044381,B42,S
888,1.722332,-0.789272,0.827377,"Johnston, Miss. Catherine Helen ""Carrie""",female,0.000000,0.432793,2.008933,W./C. 6607,-0.176263,,S
889,1.726220,1.266990,-1.566107,"Behr, Mr. Karl Howell",male,-0.284663,-0.474545,-0.473674,111369,-0.044381,C148,C


#### Now, preprocess the categorial data

In [49]:
cat_cols = titanic_df.select_dtypes(include=['object']).columns
encoder = OneHotEncoder()
titanic_encoded = encoder.fit_transform(titanic_df[cat_cols]).toarray()
titanic_encoded = pd.DataFrame(titanic_encoded, columns=encoder.get_feature_names_out())
titanic_df = pd.concat([titanic_df, titanic_encoded], axis=1)
titanic_df = titanic_df.drop(cat_cols, axis=1)
display(titanic_df)

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare,"Name_Abbing, Mr. Anthony","Name_Abbott, Mr. Rossmore Edward","Name_Abbott, Mrs. Stanton (Rosa Hunt)",...,Cabin_F33,Cabin_F38,Cabin_F4,Cabin_G6,Cabin_T,Cabin_nan,Embarked_C,Embarked_Q,Embarked_S,Embarked_nan
0,-1.730108,-0.789272,0.827377,-0.592481,0.432793,-0.473674,-0.502445,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
1,-1.726220,1.266990,-1.566107,0.638789,0.432793,-0.473674,0.786845,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
2,-1.722332,1.266990,0.827377,-0.284663,-0.474545,-0.473674,-0.488854,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
3,-1.718444,1.266990,-1.566107,0.407926,0.432793,-0.473674,0.420730,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
4,-1.714556,-0.789272,0.827377,0.407926,-0.474545,-0.473674,-0.486337,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,1.714556,-0.789272,-0.369365,-0.207709,-0.474545,-0.473674,-0.386671,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
887,1.718444,1.266990,-1.566107,-0.823344,-0.474545,-0.473674,-0.044381,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
888,1.722332,-0.789272,0.827377,0.000000,0.432793,2.008933,-0.176263,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
889,1.726220,1.266990,-1.566107,-0.284663,-0.474545,-0.473674,-0.044381,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0


#### Divide the data into train & test sets

In [50]:
titanic_df['Survived'] = (titanic_df['Survived'] > titanic_df['Survived'].median()).astype(int)
X = titanic_df.drop('Survived', axis=1)
y = titanic_df['Survived']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

#### Apply machine learning models

#### **🌟 Linear Regression**

In [51]:
lr = LinearRegression()
lr.fit(X_train, y_train)
lr_score = lr.score(X_test, y_test)
print(f'Linear Regression R-squared: {lr_score:.2f}')

Linear Regression R-squared: 0.51


#### **🌟 Logistic Regression**

In [52]:
lr_clf = LogisticRegression()
lr_clf.fit(X_train, y_train)
lr_clf_score = lr_clf.score(X_test, y_test)
print(f'Logistic Regression Accuracy: {lr_clf_score:.2f}')

Logistic Regression Accuracy: 0.82


#### **🌟 Random Forest Regressor**

In [53]:
rf_reg = RandomForestRegressor(random_state=42)
param_grid = {'n_estimators': [100, 200, 300], 'max_depth': [5, 10, 15]}
grid_search = GridSearchCV(rf_reg, param_grid, cv=5)
grid_search.fit(X_train, y_train)
rf_reg_score = grid_search.score(X_test, y_test)
print(f'Random Forest Regressor R-squared: {rf_reg_score:.2f}')

Random Forest Regressor R-squared: 0.44


#### **🌟Random Forest Classifier**

In [54]:
rf_clf = RandomForestClassifier(n_estimators=100, random_state=42)
rf_clf.fit(X_train, y_train)
y_pred = rf_clf.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f'Random Forest Classifier Accuracy: {accuracy:.2f}')

Random Forest Classifier Accuracy: 0.84
