In [54]:
# 1. Import Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.impute import SimpleImputer # For handling missing values

In [55]:
# 2. Load Data
# Assuming 'train.csv' and 'test.csv' are in the same directory as your notebook
train_df = pd.read_csv("D:/AR/classfication/train (1).csv")

In [56]:
print(train_df)

     PassengerId  Survived  Pclass  \
0              1         0       3   
1              2         1       1   
2              3         1       3   
3              4         1       1   
4              5         0       3   
..           ...       ...     ...   
886          887         0       2   
887          888         1       1   
888          889         0       3   
889          890         1       1   
890          891         0       3   

                                                  Name     Sex   Age  SibSp  \
0                              Braund, Mr. Owen Harris    male  22.0      1   
1    Cumings, Mrs. John Bradley (Florence Briggs Th...  female  38.0      1   
2                               Heikkinen, Miss. Laina  female  26.0      0   
3         Futrelle, Mrs. Jacques Heath (Lily May Peel)  female  35.0      1   
4                             Allen, Mr. William Henry    male  35.0      0   
..                                                 ...     ...   ... 

In [57]:
test_df = pd.read_csv("D:/AR/classfication/test.csv")

In [58]:
print(test_df)

     PassengerId  Pclass                                          Name  \
0            892       3                              Kelly, Mr. James   
1            893       3              Wilkes, Mrs. James (Ellen Needs)   
2            894       2                     Myles, Mr. Thomas Francis   
3            895       3                              Wirz, Mr. Albert   
4            896       3  Hirvonen, Mrs. Alexander (Helga E Lindqvist)   
..           ...     ...                                           ...   
413         1305       3                            Spector, Mr. Woolf   
414         1306       1                  Oliva y Ocana, Dona. Fermina   
415         1307       3                  Saether, Mr. Simon Sivertsen   
416         1308       3                           Ware, Mr. Frederick   
417         1309       3                      Peter, Master. Michael J   

        Sex   Age  SibSp  Parch              Ticket      Fare Cabin Embarked  
0      male  34.5      0      0 

In [59]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [60]:
# Combine for consistent preprocessing (optional, but good practice)
# Make a copy to avoid SettingWithCopyWarning
combined_df = pd.concat([train_df.drop('Survived', axis=1), test_df], ignore_index=True)
# Store PassengerId for submission later
passenger_ids = test_df['PassengerId']

In [61]:
print(passenger_ids)

0       892
1       893
2       894
3       895
4       896
       ... 
413    1305
414    1306
415    1307
416    1308
417    1309
Name: PassengerId, Length: 418, dtype: int64


In [62]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [63]:
print("\nMissing values in training data:")
print(train_df.isnull().sum())


Missing values in training data:
PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64


In [64]:
# 3. Data Preprocessing

# Drop 'Cabin' (too many missing values) and 'Ticket' (complex to use)
combined_df = combined_df.drop(['Cabin', 'Ticket'], axis=1)


In [65]:
print(combined_df)

      PassengerId  Pclass                                               Name  \
0               1       3                            Braund, Mr. Owen Harris   
1               2       1  Cumings, Mrs. John Bradley (Florence Briggs Th...   
2               3       3                             Heikkinen, Miss. Laina   
3               4       1       Futrelle, Mrs. Jacques Heath (Lily May Peel)   
4               5       3                           Allen, Mr. William Henry   
...           ...     ...                                                ...   
1304         1305       3                                 Spector, Mr. Woolf   
1305         1306       1                       Oliva y Ocana, Dona. Fermina   
1306         1307       3                       Saether, Mr. Simon Sivertsen   
1307         1308       3                                Ware, Mr. Frederick   
1308         1309       3                           Peter, Master. Michael J   

         Sex   Age  SibSp  Parch      F

In [66]:
combined_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1309 entries, 0 to 1308
Data columns (total 9 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  1309 non-null   int64  
 1   Pclass       1309 non-null   int64  
 2   Name         1309 non-null   object 
 3   Sex          1309 non-null   object 
 4   Age          1046 non-null   float64
 5   SibSp        1309 non-null   int64  
 6   Parch        1309 non-null   int64  
 7   Fare         1308 non-null   float64
 8   Embarked     1307 non-null   object 
dtypes: float64(2), int64(4), object(3)
memory usage: 92.2+ KB


In [67]:
# Impute 'Age' with the median
imputer_age = SimpleImputer(strategy='median')
combined_df['Age'] = imputer_age.fit_transform(combined_df[['Age']])

In [68]:
print(combined_df.isnull().sum())

PassengerId    0
Pclass         0
Name           0
Sex            0
Age            0
SibSp          0
Parch          0
Fare           1
Embarked       2
dtype: int64


In [69]:
#impute 'Fare' with the median (only 'test_df' might have missing Fare)
imputer_fare = SimpleImputer(strategy='median')
combined_df['Fare'] = imputer_fare.fit_transform(combined_df[['Fare']])

In [70]:
# Calculate the mode (most frequent value) of the 'Embarked' column
# .iloc[0] is used because mode() can return multiple values if there's a tie
most_frequent_embarked = combined_df['Embarked'].mode()[0]

# Fill missing values in 'Embarked' with the calculated mode
combined_df['Embarked'].fillna(most_frequent_embarked, inplace=True)

# Verify no more missing values
print(combined_df['Embarked'].isnull().sum())

0


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  combined_df['Embarked'].fillna(most_frequent_embarked, inplace=True)


In [71]:

# Verify no more missing values
print(combined_df['Embarked'].isnull().sum())

0


In [72]:
# Feature Engineering: FamilySize and IsAlone
combined_df['FamilySize'] = combined_df['SibSp'] + combined_df['Parch'] + 1
combined_df['IsAlone'] = (combined_df['FamilySize'] == 1).astype(int)

In [73]:
# Feature Engineering: Extract Title from Name
combined_df['Title'] = combined_df['Name'].apply(lambda x: x.split(',')[1].split('.')[0].strip())

In [74]:
print(combined_df)

      PassengerId  Pclass                                               Name  \
0               1       3                            Braund, Mr. Owen Harris   
1               2       1  Cumings, Mrs. John Bradley (Florence Briggs Th...   
2               3       3                             Heikkinen, Miss. Laina   
3               4       1       Futrelle, Mrs. Jacques Heath (Lily May Peel)   
4               5       3                           Allen, Mr. William Henry   
...           ...     ...                                                ...   
1304         1305       3                                 Spector, Mr. Woolf   
1305         1306       1                       Oliva y Ocana, Dona. Fermina   
1306         1307       3                       Saether, Mr. Simon Sivertsen   
1307         1308       3                                Ware, Mr. Frederick   
1308         1309       3                           Peter, Master. Michael J   

         Sex   Age  SibSp  Parch      F

In [75]:
# Group rare titles
rare_titles = ['Lady', 'Countess','Capt', 'Col', 'Don', 'Dr', 'Major', 'Rev', 'Sir', 'Jonkheer', 'Dona', 'Mlle', 'Ms', 'Mme']
combined_df['Title'] = combined_df['Title'].replace(rare_titles, 'Rare')
combined_df['Title'] = combined_df['Title'].replace('Mlle', 'Miss') # Correcting common error: Mlle is Miss
combined_df['Title'] = combined_df['Title'].replace('Ms', 'Miss')
combined_df['Title'] = combined_df['Title'].replace('Mme', 'Mrs')

In [76]:
#Drop original 'Name', 'PassengerId', 'SibSp', 'Parch' as we've used them to create new features or they are not directly useful
combined_df = combined_df.drop(['Name', 'PassengerId', 'SibSp', 'Parch'], axis=1)

In [77]:
combined_df.columns

Index(['Pclass', 'Sex', 'Age', 'Fare', 'Embarked', 'FamilySize', 'IsAlone',
       'Title'],
      dtype='object')

In [78]:
#Encode Categorical Features
# Use one-hot encoding for 'Sex', 'Embarked', 'Pclass', 'Title'
# drop_first=True avoids multicollinearity
combined_df = pd.get_dummies(combined_df, columns=['Sex', 'Embarked', 'Pclass', 'Title'], drop_first=True)

In [79]:
print(combined_df)

       Age      Fare  FamilySize  IsAlone  Sex_male  Embarked_Q  Embarked_S  \
0     22.0    7.2500           2        0      True       False        True   
1     38.0   71.2833           2        0     False       False       False   
2     26.0    7.9250           1        1     False       False        True   
3     35.0   53.1000           2        0     False       False        True   
4     35.0    8.0500           1        1      True       False        True   
...    ...       ...         ...      ...       ...         ...         ...   
1304  28.0    8.0500           1        1      True       False        True   
1305  39.0  108.9000           1        1     False       False       False   
1306  38.5    7.2500           1        1      True       False        True   
1307  28.0    8.0500           1        1      True       False        True   
1308  28.0   22.3583           3        0      True       False       False   

      Pclass_2  Pclass_3  Title_Miss  Title_Mr  Tit

In [80]:
# Separate back into training and test sets
X_train_processed = combined_df.iloc[:len(train_df)]
X_test_processed = combined_df.iloc[len(train_df):]
y_train = train_df['Survived'] # The target variable for training

In [81]:
X_train_processed.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 14 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   Age                 891 non-null    float64
 1   Fare                891 non-null    float64
 2   FamilySize          891 non-null    int64  
 3   IsAlone             891 non-null    int64  
 4   Sex_male            891 non-null    bool   
 5   Embarked_Q          891 non-null    bool   
 6   Embarked_S          891 non-null    bool   
 7   Pclass_2            891 non-null    bool   
 8   Pclass_3            891 non-null    bool   
 9   Title_Miss          891 non-null    bool   
 10  Title_Mr            891 non-null    bool   
 11  Title_Mrs           891 non-null    bool   
 12  Title_Rare          891 non-null    bool   
 13  Title_the Countess  891 non-null    bool   
dtypes: bool(10), float64(2), int64(2)
memory usage: 36.7 KB


In [82]:
print("\nProcessed Training Data Head:")
print(X_train_processed.head())
print("\nProcessed Training Data Info:")
X_train_processed.info()



Processed Training Data Head:
    Age     Fare  FamilySize  IsAlone  Sex_male  Embarked_Q  Embarked_S  \
0  22.0   7.2500           2        0      True       False        True   
1  38.0  71.2833           2        0     False       False       False   
2  26.0   7.9250           1        1     False       False        True   
3  35.0  53.1000           2        0     False       False        True   
4  35.0   8.0500           1        1      True       False        True   

   Pclass_2  Pclass_3  Title_Miss  Title_Mr  Title_Mrs  Title_Rare  \
0     False      True       False      True      False       False   
1     False     False       False     False       True       False   
2     False      True        True     False      False       False   
3     False     False       False     False       True       False   
4     False      True       False      True      False       False   

   Title_the Countess  
0               False  
1               False  
2               False  
3

In [83]:
# 4. Model Training

# Initialize the RandomForestClassifier
model = RandomForestClassifier(n_estimators=100, random_state=42)

In [84]:

# Train the model
model.fit(X_train_processed, y_train)

0,1,2
,n_estimators,100
,criterion,'gini'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [85]:
# Make predictions on the training data (for internal evaluation, not for submission to Kaggle)
y_train_pred = model.predict(X_train_processed)
print(f"\nTraining Accuracy: {accuracy_score(y_train, y_train_pred):.4f}")
print("\nTraining Classification Report:")
print(classification_report(y_train, y_train_pred))


Training Accuracy: 0.9820

Training Classification Report:
              precision    recall  f1-score   support

           0       0.98      0.99      0.99       549
           1       0.99      0.96      0.98       342

    accuracy                           0.98       891
   macro avg       0.98      0.98      0.98       891
weighted avg       0.98      0.98      0.98       891



In [86]:
# Perform Cross-Validation (highly recommended for robust evaluation)
cv_scores = cross_val_score(model, X_train_processed, y_train, cv=5, scoring='accuracy')
print(f"\nCross-Validation Accuracy Scores: {cv_scores}")
print(f"Mean CV Accuracy: {cv_scores.mean():.4f}")
print(f"Std CV Accuracy: {cv_scores.std():.4f}")


Cross-Validation Accuracy Scores: [0.78212291 0.78651685 0.85955056 0.75842697 0.81460674]
Mean CV Accuracy: 0.8002
Std CV Accuracy: 0.0346


In [87]:
#You can also make predictions on the *test.csv* data (for Kaggle submission)
# and evaluate if you had a 'Survived' column for it (which you don't in Kaggle's public test set)
# y_test_pred = model.predict(X_test_processed)
# print(f"\nTest Set Predictions (for submission):\n{y_test_pred[:10]}") # Show first 10 predictions

# 6. Generate Submission File (for Kaggle)

# Predict on the actual test dataset (the one without 'Survived' column)
kaggle_predictions = model.predict(X_test_processed)


In [88]:
# Create the submission DataFrame
submission_df = pd.DataFrame({
    'PassengerId': passenger_ids, # Use the stored PassengerIds from the original test_df
    'Survived': kaggle_predictions
})

# Save to CSV
submission_df.to_csv('submission.csv', index=False)
print("\nSubmission file 'submission.csv' created successfully!")
print(submission_df.head())


Submission file 'submission.csv' created successfully!
   PassengerId  Survived
0          892         0
1          893         0
2          894         0
3          895         1
4          896         1
