<a href="https://colab.research.google.com/github/rahmani3101/Machine-Learning-/blob/main/Data_Pre_processing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd

## Section 1: Data Loading

In [None]:
url = "https://raw.githubusercontent.com/datasciencedojo/datasets/master/titanic.csv"
df = pd.read_csv(url)

In [None]:
# 2. Display the first five rows of the dataset
print(df.head())

   PassengerId  Survived  Pclass  \
0            1         0       3   
1            2         1       1   
2            3         1       3   
3            4         1       1   
4            5         0       3   

                                                Name     Sex   Age  SibSp  \
0                            Braund, Mr. Owen Harris    male  22.0      1   
1  Cumings, Mrs. John Bradley (Florence Briggs Th...  female  38.0      1   
2                             Heikkinen, Miss. Laina  female  26.0      0   
3       Futrelle, Mrs. Jacques Heath (Lily May Peel)  female  35.0      1   
4                           Allen, Mr. William Henry    male  35.0      0   

   Parch            Ticket     Fare Cabin Embarked  
0      0         A/5 21171   7.2500   NaN        S  
1      0          PC 17599  71.2833   C85        C  
2      0  STON/O2. 3101282   7.9250   NaN        S  
3      0            113803  53.1000  C123        S  
4      0            373450   8.0500   NaN        S  


In [None]:
df.size

10692

In [None]:
# 3. Display the shape, column names, and data types of the dataset
print("Shape of the dataset:", df.shape)
print("Column Names:", df.columns)
print("Data Types:", df.dtypes)


Shape of the dataset: (891, 12)
Column Names: Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')
Data Types: PassengerId      int64
Survived         int64
Pclass           int64
Name            object
Sex             object
Age            float64
SibSp            int64
Parch            int64
Ticket          object
Fare           float64
Cabin           object
Embarked        object
dtype: object


# Section 2: Handling Missing Values

In [None]:
df.isnull().sum()

Unnamed: 0,0
PassengerId,0
Survived,0
Pclass,0
Name,0
Sex,0
Age,177
SibSp,0
Parch,0
Ticket,0
Fare,0


In [None]:
# 5. Fill missing values in the Age column using mean imputation
df['Age'].fillna(df['Age'].mean(), inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Age'].fillna(df['Age'].mean(), inplace=True)


In [None]:
# 6. Fill missing values in the Fare column using median imputation
df['Fare'].fillna(df['Fare'].median(), inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Fare'].fillna(df['Fare'].median(), inplace=True)


In [None]:
# 7. Fill missing values in the Embarked column using mode imputation
df['Embarked'].fillna(df['Embarked'].mode()[0], inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Embarked'].fillna(df['Embarked'].mode()[0], inplace=True)


In [None]:
# 8

df['Cabin'].fillna('Unknown',inplace = True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Cabin'].fillna('Unknown',inplace = True)


In [None]:
# 9. Apply forward fill on the Age column
df['Age'] = df['Age'].fillna(method='ffill')

  df['Age'] = df['Age'].fillna(method='ffill')


In [None]:
# 10. Apply backward fill on the Age column
df['Age'] = df['Age'].fillna(method='bfill')
print("\nUpdated Dataset:")
print(df.head())


Updated Dataset:
   PassengerId  Survived  Pclass  \
0            1         0       3   
1            2         1       1   
2            3         1       3   
3            4         1       1   
4            5         0       3   

                                                Name     Sex   Age  SibSp  \
0                            Braund, Mr. Owen Harris    male  22.0      1   
1  Cumings, Mrs. John Bradley (Florence Briggs Th...  female  38.0      1   
2                             Heikkinen, Miss. Laina  female  26.0      0   
3       Futrelle, Mrs. Jacques Heath (Lily May Peel)  female  35.0      1   
4                           Allen, Mr. William Henry    male  35.0      0   

   Parch            Ticket     Fare    Cabin Embarked  
0      0         A/5 21171   7.2500  Unknown        S  
1      0          PC 17599  71.2833      C85        C  
2      0  STON/O2. 3101282   7.9250  Unknown        S  
3      0            113803  53.1000     C123        S  
4      0            37

  df['Age'] = df['Age'].fillna(method='bfill')


# Section 3: Encoding Categorical Variables

In [None]:
from sklearn.preprocessing import LabelEncoder

# Initialize the LabelEncoder
le = LabelEncoder()

# Apply label encoding to 'sex' column
df['Sex'] = le.fit_transform(df['Sex'])


In [None]:
df = pd.get_dummies(df, columns=['Embarked'], drop_first=True)

# Section 4 : Feature Scaling

In [None]:
df['age_minmax'] = (df['Age'] - df['Age'].min()) / (df['Age'].max() - df['Age'].min())

In [None]:
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked_Q,Embarked_S,age_minmax
0,1,0,3,"Braund, Mr. Owen Harris",1,22.0,1,0,A/5 21171,7.25,Unknown,False,True,0.271174
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",0,38.0,1,0,PC 17599,71.2833,C85,False,False,0.472229
2,3,1,3,"Heikkinen, Miss. Laina",0,26.0,0,0,STON/O2. 3101282,7.925,Unknown,False,True,0.321438
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",0,35.0,1,0,113803,53.1,C123,False,True,0.434531
4,5,0,3,"Allen, Mr. William Henry",1,35.0,0,0,373450,8.05,Unknown,False,True,0.434531


In [None]:
# Z score standardization
df['fare_z'] = (df['Fare'] - df['Fare'].mean()) / df['Fare'].std()

In [None]:
# Robust Scaling
df['fare_robust'] = (df['Fare'] - df['Fare'].quantile(0.25)) / (df['Fare'].quantile(0.75) - df['Fare'].quantile(0.25))

In [None]:
from sklearn.preprocessing import MaxAbsScaler

# Initialize the MaxAbs scaler
max_abs_scaler = MaxAbsScaler()

# Apply Max-Abs scaling to 'fare' column
df['fare_maxabs'] = max_abs_scaler.fit_transform(df[['Fare']])


In [None]:
df

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked_Q,Embarked_S,age_minmax,fare_z,fare_robust,fare_maxabs
0,1,0,3,"Braund, Mr. Owen Harris",1,22.000000,1,0,A/5 21171,7.2500,Unknown,False,True,0.271174,-0.502163,-0.028602,0.014151
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",0,38.000000,1,0,PC 17599,71.2833,C85,False,False,0.472229,0.786404,2.744651,0.139136
2,3,1,3,"Heikkinen, Miss. Laina",0,26.000000,0,0,STON/O2. 3101282,7.9250,Unknown,False,True,0.321438,-0.488580,0.000632,0.015469
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",0,35.000000,1,0,113803,53.1000,C123,False,True,0.434531,0.420494,1.957141,0.103644
4,5,0,3,"Allen, Mr. William Henry",1,35.000000,0,0,373450,8.0500,Unknown,False,True,0.434531,-0.486064,0.006046,0.015713
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",1,27.000000,0,0,211536,13.0000,Unknown,False,True,0.334004,-0.386454,0.220428,0.025374
887,888,1,1,"Graham, Miss. Margaret Edith",0,19.000000,0,0,112053,30.0000,B42,False,True,0.233476,-0.044356,0.956690,0.058556
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",0,29.699118,1,2,W./C. 6607,23.4500,Unknown,False,True,0.367921,-0.176164,0.673013,0.045771
889,890,1,1,"Behr, Mr. Karl Howell",1,26.000000,0,0,111369,30.0000,C148,False,False,0.321438,-0.044356,0.956690,0.058556


In [None]:
# Calculate the IQR for 'age' column
Q1 = df['Age'].quantile(0.25)
Q3 = df['Age'].quantile(0.75)
IQR = Q3 - Q1

# Define the outlier bounds
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

# Filter out the outliers
df = df[(df['Age'] >= lower_bound) & (df['Age'] <= upper_bound)]


In [None]:
from scipy.stats import zscore

# Calculate Z-scores for the 'age' column
z_scores = zscore(df['Age'])

# Define the threshold for Z-scores
threshold = 3

# Filter out the outliers
df = df[(z_scores < threshold) & (z_scores > -threshold)]


### 6 Create a new column family_size = sibsp + parch + 1.

In [None]:
# 18. Create family_size
df['family_size'] = df['SibSp'] + df['Parch'] + 1

In [None]:
# 19. Create fare_per_person
df['fare_per_person'] = df['Fare'] / df['family_size']

# Section 7: Correlation Analysis

In [None]:

corr_cols = ['Age', 'Fare', 'family_size', 'Survived']
print("\nCorrelation matrix:\n", df[corr_cols].corr())


Correlation matrix:
                   Age      Fare  family_size  Survived
Age          1.000000  0.091278    -0.190345 -0.027320
Fare         0.091278  1.000000     0.209281  0.269672
family_size -0.190345  0.209281     1.000000  0.016065
Survived    -0.027320  0.269672     0.016065  1.000000


# Section 8: Cleanup for Modeling

In [None]:
df.drop(['Name', 'Ticket', 'PassengerId', 'Cabin'], axis=1, inplace=True)

In [None]:
print("\nFinal dataset shape after cleanup:", df.shape)
print("\nFinal dataset preview:")
print(df.head())


Final dataset shape after cleanup: (825, 15)

Final dataset preview:
   Survived  Pclass  Sex   Age  SibSp  Parch     Fare  Embarked_Q  Embarked_S  \
0         0       3    1  22.0      1      0   7.2500       False        True   
1         1       1    0  38.0      1      0  71.2833       False       False   
2         1       3    0  26.0      0      0   7.9250       False        True   
3         1       1    0  35.0      1      0  53.1000       False        True   
4         0       3    1  35.0      0      0   8.0500       False        True   

   age_minmax    fare_z  fare_robust  fare_maxabs  family_size  \
0    0.271174 -0.502163    -0.028602     0.014151            2   
1    0.472229  0.786404     2.744651     0.139136            2   
2    0.321438 -0.488580     0.000632     0.015469            1   
3    0.434531  0.420494     1.957141     0.103644            2   
4    0.434531 -0.486064     0.006046     0.015713            1   

   fare_per_person  
0          3.62500  
1   