In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import MinMaxScaler

from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

from sklearn.metrics import confusion_matrix, classification_report

## • Functions

In [None]:
def histogram_rep(df, col_name, total_rows, x_label, x_labels, title, figsize):
    """Print value statistics for one column and draw them as a bar chart.

    Prints the percentage of entries that cannot be parsed as numbers, then
    the percentage of rows taken by each distinct value, and finally shows a
    bar chart of those percentages.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame that contains ``col_name``.
    col_name : str
        Column to summarise.
    total_rows : int
        Denominator used for every percentage.
    x_label : str
        Label of the x axis.
    x_labels : sequence
        Tick labels. They are paired positionally with
        ``df[col_name].unique()``, so they must have the same length and order.
    title : str
        Chart title.
    figsize : tuple
        Figure size passed to ``plt.figure``.

    Raises
    ------
    ValueError
        If ``x_labels`` does not contain one label per distinct value.
    """
    column = df[col_name]

    # errors='coerce' turns everything that is not parseable as a number
    # (including values that are already missing) into NaN.
    numeric_data = pd.to_numeric(column, errors='coerce')
    nan_count = numeric_data.isnull().sum()
    percentage_non_numeric = (nan_count / total_rows) * 100
    print(f"Percentage of non-numerical values in the '{col_name}' column: {percentage_non_numeric:.2f}%")

    # Distinct values in order of first appearance
    unique_values = column.unique()
    print(f"Unique values in '{col_name}' column:")

    # Percentage of rows taken by each distinct value
    percentage_values = []
    for value in unique_values:
        if pd.isna(value):
            # NaN never compares equal to anything, so count missing entries explicitly
            count = column.isna().sum()
        else:
            count = (column == value).sum()
        percentage = (count / total_rows) * 100
        percentage_values.append(percentage)
        print(f"Value: {value}, Percentage: {percentage:.2f}%")

    # Fail early with a readable message instead of a matplotlib tick error
    if len(x_labels) != len(unique_values):
        raise ValueError(
            f"x_labels has {len(x_labels)} entries but column '{col_name}' "
            f"has {len(unique_values)} distinct values"
        )

    # Bar chart of the percentages
    plt.figure(figsize=figsize)
    plt.bar(unique_values, percentage_values)
    plt.xlabel(x_label)
    plt.ylabel('Percentage')
    plt.title(title)
    plt.xticks(unique_values, x_labels)
    plt.show()

In [None]:
def survival_percentage(data, column_name):
    """Return the survival rate (in percent) for every distinct value of a column.

    The result maps each distinct value of ``data[column_name]``, in order of
    first appearance, to ``sum(Survived) / row count * 100`` over the rows
    holding that value. Values that match no row under ``==`` (such as
    missing values) are left out.
    """
    rates = {}

    for category in data[column_name].unique():
        survived_flags = data.loc[data[column_name] == category, "Survived"]
        group_size = len(survived_flags)
        if group_size == 0:
            continue
        rates[category] = (survived_flags.sum() / group_size) * 100

    return rates


In [None]:
def survival_percentage_continuous(data, column_name, num_bins=None, bin_labels=None):
    """Bin a numeric column, compute the survival rate per bin and plot it.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing ``column_name`` and a ``'Survived'`` column.
        It is not modified.
    column_name : str
        Numeric column to cut into equal-width bins.
    num_bins : int, optional
        Number of bins. Defaults to ``len(bin_labels)`` when labels are
        given, otherwise to 10.
    bin_labels : list, optional
        One label per bin. Defaults to ``'Bin 1'``, ``'Bin 2'``, ...

    Returns
    -------
    dict
        Bin label -> survival percentage, in label order. Bins that contain
        no rows are omitted.
    """
    if num_bins is None:
        # Follow the supplied labels when there are any; otherwise use 10 bins
        num_bins = len(bin_labels) if bin_labels is not None else 10

    if bin_labels is None:
        bin_labels = [f'Bin {i+1}' for i in range(num_bins)]  # Default bin labels

    # Keep the bin assignment in a local Series so the caller's DataFrame is
    # never altered (and an existing 'bins' column is not overwritten/dropped).
    bin_of_row = pd.cut(data[column_name], bins=num_bins, labels=bin_labels)

    # Survival percentage for each non-empty bin
    bin_percentages = {}
    for bin_label in bin_labels:
        survived_in_bin = data.loc[bin_of_row == bin_label, 'Survived']
        total_count = len(survived_in_bin)
        if total_count > 0:
            bin_percentages[bin_label] = (survived_in_bin.sum() / total_count) * 100

    # Bar chart of the survival percentages
    plt.figure(figsize=(10, 6))
    plt.bar(list(bin_percentages.keys()), list(bin_percentages.values()), color='skyblue')
    plt.xlabel('Bins')
    plt.ylabel('Survival Percentage (%)')
    plt.title(f'Survival Percentage by {column_name}')
    plt.xticks(rotation=45)
    plt.grid(axis='y')
    plt.show()

    return bin_percentages

In [None]:
def draw_pie_chart(percentages):
    """Draw a pie chart from a mapping of category -> value.

    Parameters
    ----------
    percentages : dict
        Category -> numeric value. The wedge sizes are these values
        normalised by matplotlib, so the number printed inside a wedge is the
        category's share of the total, while the legend shows the raw value.

    Categories whose value is not greater than 0 get neither a wedge label
    nor a legend entry.
    """
    labels = list(percentages.keys())
    sizes = list(percentages.values())

    # One shade of blue per category
    num_categories = len(labels)
    colors = plt.cm.Blues(np.linspace(0.1, 1, num_categories))

    plt.figure(figsize=(8, 6))  # Adjust the figure size as needed
    pie_artists = plt.pie(
        sizes,
        labels=None,
        colors=colors,
        autopct=lambda p: f'{p:.1f}%' if p > 0 else '',
        startangle=140,
    )
    wedges = pie_artists[0]  # first element holds the wedge patches
    plt.axis('equal')  # Equal aspect ratio ensures that pie is drawn as a circle.
    plt.title('Survival Percentage by Category')

    # Pair every legend text with its own wedge. Passing only the filtered
    # texts would attach them to the first wedges in order, which mislabels
    # the chart as soon as a zero-sized category precedes a non-zero one.
    legend_entries = [
        (wedge, f'{label} ({size:.1f}%)')
        for wedge, label, size in zip(wedges, labels, sizes)
        if size > 0
    ]
    plt.legend(
        [wedge for wedge, _ in legend_entries],
        [text for _, text in legend_entries],
        loc='best',
    )

    plt.show()

In [None]:
def find_between(s, first, last):
    """Return the text of ``s`` between ``first`` and the next ``last``.

    The search for ``last`` starts right after the first occurrence of
    ``first``. An empty string is returned when either marker is missing.
    """
    marker_pos = s.find(first)
    if marker_pos == -1:
        return ""

    start = marker_pos + len(first)
    end = s.find(last, start)
    return "" if end == -1 else s[start:end]

In [None]:
def percent(col_name, total_rows, data=None):
    """Print value statistics for one column.

    Prints the percentage of entries that cannot be parsed as numbers,
    followed by the percentage of rows taken by each distinct value.

    Parameters
    ----------
    col_name : str
        Column to summarise.
    total_rows : int
        Denominator used for every percentage.
    data : pandas.DataFrame, optional
        Frame to read from. When omitted, the notebook-level ``df`` is used,
        which is what this function always did before.
    """
    if data is None:
        data = df  # fall back to the notebook-level DataFrame
    column = data[col_name]

    # errors='coerce' turns everything that is not parseable as a number
    # (including values that are already missing) into NaN.
    numeric_data = pd.to_numeric(column, errors='coerce')
    nan_count = numeric_data.isnull().sum()
    percentage_non_numeric = (nan_count / total_rows) * 100
    print(f"Percentage of non-numerical values in the '{col_name}' column: {percentage_non_numeric:.2f}%")

    print(f"Unique values in '{col_name}' column:")

    # Percentage of rows taken by each distinct value, in order of first appearance
    for value in column.unique():
        if pd.isna(value):
            # NaN never compares equal to anything, so count missing entries explicitly
            count = column.isna().sum()
        else:
            count = (column == value).sum()
        percentage = (count / total_rows) * 100
        print(f"Value: {value}, Percentage: {percentage:.2f}%")

In [None]:
def get_null_percentage(dataframe):
    """Return, for every column, the percentage of rows whose value is null.

    The result is a Series indexed by column name.
    """
    row_count = len(dataframe)
    nulls_per_column = dataframe.isna().sum()
    return nulls_per_column.div(row_count).mul(100)

In [None]:
# Define a function to extract the ticket category
def extract_ticket_category(ticket):
    """Classify a ticket string by the text before its first space.

    Returns ``None`` for a missing ticket, the text before the first space
    when the ticket contains one, and ``'Normal'`` otherwise.
    """
    if pd.isna(ticket):
        return None

    prefix, separator, _ = ticket.partition(' ')
    return prefix if separator else 'Normal'

In [None]:
def fill_null_with_mode(dataframe, column_name):
    """Replace the null values of one column with that column's mode, in place.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame to modify.
    column_name : str
        Column whose nulls are filled.

    When several values tie for the mode, the first one returned by
    ``Series.mode()`` is used. A column that holds no non-null value has no
    mode and is left unchanged.
    """
    modes = dataframe[column_name].mode()
    if modes.empty:
        # Nothing to fill with: every entry of the column is null
        return

    # Assign the filled column back instead of calling fillna(inplace=True)
    # on the selected column: the chained in-place form operates on an
    # intermediate object and is not guaranteed to update the frame.
    dataframe[column_name] = dataframe[column_name].fillna(modes.iloc[0])

## Read the dataset

In [None]:
# Load the CSV from the Kaggle input directory into `df`, the DataFrame the following cells work on
df = pd.read_csv('/kaggle/input/test-file/tested.csv')

# • Data Visualization & Pre-processing

In [None]:
# Call head() (the original printed the bound method object instead of calling it)
# and let the notebook render the first rows
df.head()

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) for the columns describe() covers by default
df.describe()

In [None]:
# Call info() (the original printed the bound method object instead of calling it);
# info() prints column names, non-null counts and dtypes itself
df.info()

In [None]:
# Columns under consideration; later cells edit this list as columns are dropped or added
features = ['Pclass','Name', 'Sex', 'Age', 'SibSp', 'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked','Survived']
# Percentage of missing values in each of the listed columns
null_percentage = get_null_percentage(df[features])

# Print the null percentage for each feature
print("Percentage of null values for each feature:")
# Note: the "%" is printed once after the whole Series, not after each value
print(null_percentage,"%")

In [None]:
# Drop both columns in one call. errors='ignore' keeps the cell re-runnable:
# the in-place version raised KeyError when executed a second time.
df = df.drop(columns=['Cabin', 'Age'], errors='ignore')

In [None]:
# Keep every feature name except 'Cabin' and 'Age'
features = [name for name in features if name not in ('Cabin', 'Age')]

# Show what is left
print(features)

### 1) Sex

In [None]:
# Survival rate per value of 'Sex'
sex_result = survival_percentage(df, "Sex")

for key, value in sex_result.items():
    # Fixed the "precentage" typo in the printed message
    print(f"percentage of {key} who survived: {value:.2f}%")

draw_pie_chart(sex_result)

Therefore, included

### 2) Name --> Title

In [None]:
# The title is the text between ", " and the next "." in each name.
# Map over the 'Name' column directly rather than applying a lambda to every
# row with axis=1, which builds a Series per row just to read one field.
df['Title'] = df['Name'].map(lambda name: find_between(name, ", ", "."))
features.append('Title')
# 'Name' is no longer needed once the title has been extracted
df.drop(columns=['Name'], inplace=True)
print(df['Title'])

In [None]:
# Remove 'Name' from the features list
# (list.remove raises ValueError if 'Name' is no longer present, e.g. when this cell is run twice)
features.remove('Name')

# Print the updated features list
print(features)

In [None]:
col_name = 'Title'
x_label = 'Title'
title = 'Titles percentages'
figsize = (9, 3)
# histogram_rep pairs tick labels positionally with df[col_name].unique(), so
# derive the labels from that same call instead of hard-coding an assumed
# order and count (a mismatch mislabels bars or raises inside matplotlib).
x_labels = [str(value) for value in df[col_name].unique()]

histogram_rep(df, col_name, len(df), x_label, x_labels, title, figsize)

In [None]:
# Survival rate per extracted title
title_result = survival_percentage(df, "Title")

for key, value in title_result.items():
    # Fixed the "precentage" typo in the printed message
    print(f"percentage of {key} who survived: {value:.2f}%")

# TODO(review): the original note here read "others?????" - presumably a
# reminder to group rare titles into one slice; confirm intent.
draw_pie_chart(title_result)

##### Same result as the 'Sex' column: all 'female' passengers survived
##### So this column adds nothing useful to our prediction
##### Therefore, not included

In [None]:
# Discard the 'Title' column and keep the features list in sync with df
# (both statements fail if 'Title' is already gone, e.g. when this cell is run twice)
df.drop(columns=['Title'], inplace=True)
features.remove('Title')
print(features)

In [None]:
# First rows of df after the column changes made above
df.head()

### 3) Embarked

In [None]:
col_name = 'Embarked'
x_label = 'Embarked'
# histogram_rep pairs tick labels positionally with df[col_name].unique(), so
# derive the labels from that same call instead of hard-coding an assumed order.
x_labels = [str(value) for value in df[col_name].unique()]
title = 'Embarked percentage'
figsize = (3,3)

histogram_rep(df, col_name, len(df), x_label, x_labels, title, figsize)

In [None]:
# Survival rate per value of 'Embarked'
embarked_result = survival_percentage(df, "Embarked")

for key, value in embarked_result.items():
    # Fixed the "precentage" typo in the printed message
    print(f"percentage of {key} who survived: {value:.2f}%")

draw_pie_chart(embarked_result)

### 4) Pclass

In [None]:
col_name = 'Pclass'
x_label = 'Pclass'
# histogram_rep pairs tick labels positionally with df[col_name].unique(), so
# translate each value found in the data instead of hard-coding an assumed order.
class_names = {1: '1st class', 2: '2nd class', 3: '3rd class'}
x_labels = [class_names.get(value, str(value)) for value in df[col_name].unique()]
title = 'Percentage of Pclass'
figsize = (3,4)

histogram_rep(df, col_name, len(df), x_label, x_labels, title, figsize)

In [None]:
# Survival rate per passenger class. Stored under its own name: the original
# reused `embarked_result`, silently replacing the 'Embarked' figures.
pclass_result = survival_percentage(df, "Pclass")

for key, value in pclass_result.items():
    # Fixed the "precentage" typo in the printed message
    print(f"percentage of {key} who survived: {value:.2f}%")

draw_pie_chart(pclass_result)

## 5) SibSp

In [None]:
col_name = 'SibSp'
x_label = 'SibSp'
title = 'SibSp percentages'
figsize = (4, 3)
# histogram_rep pairs tick labels positionally with df[col_name].unique(), so
# derive the labels from that same call instead of hard-coding an assumed order.
x_labels = [str(value) for value in df[col_name].unique()]

histogram_rep(df, col_name, len(df), x_label, x_labels, title, figsize)

In [None]:
# Survival rate per value of 'SibSp'
SibSp_result = survival_percentage(df, "SibSp")

for key, value in SibSp_result.items():
    # Fixed the "precentage" typo in the printed message
    print(f"percentage of {key} who survived: {value:.2f}%")

draw_pie_chart(SibSp_result)

## 6) Parch

In [None]:
col_name = 'Parch'
x_label = 'Parch'
title = 'Parch percentages'
figsize = (9, 3)
# histogram_rep pairs tick labels positionally with df[col_name].unique(), so
# derive the labels from that same call instead of hard-coding an assumed order.
x_labels = [str(value) for value in df[col_name].unique()]

histogram_rep(df, col_name, len(df), x_label, x_labels, title, figsize)

In [None]:
# Survival rate per value of 'Parch'
Parch_result = survival_percentage(df, "Parch")

for key, value in Parch_result.items():
    # Fixed the "precentage" typo in the printed message
    print(f"percentage of {key} who survived: {value:.2f}%")

draw_pie_chart(Parch_result)

## 7) Ticket

In [None]:
# Split the 'Ticket' column and store the first part in 'Ticket_category'
# NOTE(review): the next cell reassigns 'Ticket_category' via extract_ticket_category,
# so the value computed here is overwritten before it is used.
df['Ticket_category'] = df['Ticket'].str.split(' ').str[0]

In [None]:
col_name = 'Ticket'
# Tickets containing a space keep the text before it; other non-null tickets become 'Normal'
df['Ticket_category'] = df[col_name].apply(extract_ticket_category)
# Print how often each ticket category occurs, as a percentage of all rows
percent('Ticket_category', len(df))

In [None]:
# Replace the raw 'Ticket' column by its derived category, in df and in the features list
# (not re-runnable: the drop and the remove both fail once 'Ticket' is gone)
df.drop(columns=['Ticket'], inplace=True)
features.remove('Ticket')
features.append('Ticket_category')
print(features)

In [None]:
# Number of rows per ticket category
ticket_category_counts = df['Ticket_category'].value_counts()

# Category names and their frequencies, in matching order
categories, counts = ticket_category_counts.index, ticket_category_counts.values

# Scatter plot: one point per category, height = number of rows
plt.figure(figsize=(18, 6))
plt.scatter(categories, counts, c='blue', marker='o')
plt.title('Ticket Category Distribution')
plt.xlabel('Ticket Category')
plt.ylabel('Count')
plt.grid(True)
plt.xticks(rotation=45)
plt.show()

##### So, there is an outlier

In [None]:
# Discard 'Ticket_category' again and keep the features list in sync with df
# (not re-runnable: both statements fail once 'Ticket_category' is gone)
df.drop(columns=['Ticket_category'], inplace=True)
features.remove('Ticket_category')
print(features)

## 8) Fare: A numerical variable for the price of the ticket.

In [None]:
col_name = 'Fare'
# Prints the non-numeric share of 'Fare' and one line per distinct fare value
percent(col_name, len(df))
# Kernel density estimate of the fare distribution
sns.kdeplot(data=df['Fare'], color='red')

### • pre-processing

In [None]:
# Call head() (the original printed the bound method object instead of calling it)
# and let the notebook render the first rows
df.head()

##### Now, there are non-numerical values
#####    To remove them, we could use linear regression to predict the missing values from the other columns
##### Or simply drop the column

In [None]:
# Define the features to use for prediction
#use_features = ['Pclass', 'Sex_female', 'Sex_male', 'SibSp', 'Parch', 'Embarked_C', 'Embarked_Q', 'Embarked_S']

# Filter the DataFrame to only include rows with non-null 'Fare' values
#df_clean = df.dropna(subset=['Fare'])

# Create the feature matrix X and target variable y
#X = df_clean[use_features]
#y = df_clean['Fare']

# Create and fit the linear regression model
#model = LinearRegression()
#model.fit(X, y)

# Filter the DataFrame to only include rows with null 'Fare' values
#null_fare_indices = df[df['Fare'].isnull()].index

# Predict missing 'Fare' values using the model
#predicted_fares = model.predict(df.loc[null_fare_indices, use_features])

# Fill the null 'Fare' values with the predicted values
#df.loc[null_fare_indices, 'Fare'] = predicted_fares

In [None]:
#col_name = 'Fare'
#percent(col_name, len(df))
#sns.kdeplot(data=df['Fare'], color='red')

In [None]:
# Drop 'Fare' and keep the features list in sync with df
# (not re-runnable: both statements fail once 'Fare' is gone)
df.drop(columns=['Fare'], inplace=True)
features.remove('Fare')
print(features)

In [None]:
# Call head() (the original printed the bound method object instead of calling it)
# and let the notebook render the first rows
df.head()

## 9) Embarked: Port of embarkation.

In [None]:
col_name = 'Embarked'
x_label = 'Embarked'
title = 'Embarked percentages'
figsize = (3, 3)
# histogram_rep pairs tick labels positionally with df[col_name].unique(), so
# derive the labels from that same call instead of hard-coding an assumed order.
x_labels = [str(value) for value in df[col_name].unique()]

histogram_rep(df, col_name, len(df), x_label, x_labels, title, figsize)

In [None]:
# Survival rate per value of 'Embarked' (same computation as in section 3, stored under a different name)
Embarked_result = survival_percentage(df, "Embarked")

# One line per distinct 'Embarked' value
for key, value in Embarked_result.items():
    print(f"percentage of {key} who survived: {value:.2f}%")

draw_pie_chart(Embarked_result)

In [None]:
# Feature names that remain after the removals above
print(features)

In [None]:
# One-hot encode the categorical 'Sex' column of the data itself. The original
# passed the list of column *names* to get_dummies, which replaced df with an
# encoding of those names and left no 'Survived' or feature columns to select.
# The guard keeps the cell re-runnable once 'Sex' has been encoded.
if 'Sex' in df.columns:
    df = pd.get_dummies(df[features], columns=['Sex'], dtype=int)

# Model inputs: the numeric columns plus the indicator columns created for 'Sex'
use_features = ['Pclass', 'SibSp', 'Parch'] + [
    column for column in df.columns if column.startswith('Sex_')
]

In [None]:
# First rows of df as reassigned by the previous cell
df.head()

In [None]:
# Feature matrix: the columns listed in use_features
X = df[use_features]
# Target vector: the 'Survived' column
y = df['Survived']

In [None]:
# Column dtypes and non-null counts of the feature matrix and of the target
X.info()
y.info()

# • Divide the data into train and test

In [None]:
# Hold out 20% of the rows as the test set; random_state=1 makes the split reproducible
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)

In [None]:
# Carve the validation set out of the training portion only. The original
# repeated the train/test split on (X, y) with identical arguments, so the
# "validation" set was exactly the test set.
# The first call reproduces the train/test split (same arguments, same
# random_state, hence the same X_test / y_test), which keeps this cell
# idempotent instead of shrinking X_train on every re-run.
X_train_full, X_test, y_train_full, y_test = train_test_split(X, y, test_size=0.2, random_state=1)
# 0.25 of the remaining 80% is 20% of all rows, giving a 60/20/20 split
X_train, X_val, y_train, y_val = train_test_split(
    X_train_full, y_train_full, test_size=0.25, random_state=1
)

In [None]:
# Create and fit the model
#model = RandomForestClassifier(n_estimators=100, max_depth=5, random_state=1)
#model.fit(X_train, y_train)

# Make predictions on both train and test data
#train_predictions = model.predict(X_train)
#test_predictions = model.predict(X_test)

# Calculate train and test accuracy
#train_accuracy = accuracy_score(y_train, train_predictions)
#test_accuracy = accuracy_score(y_test, test_predictions)

# Print both train and test accuracy
#print(f"Train Accuracy: {train_accuracy:.2%}")
#print(f"Test Accuracy: {test_accuracy:.2%}")

In [None]:
# Define a list of models to try
models = [
    ("Random Forest", RandomForestClassifier(n_estimators=100, max_depth=5, random_state=1)),
    ("Gradient Boosting", GradientBoostingClassifier(n_estimators=100, max_depth=5, random_state=1)),
    ("Logistic Regression", LogisticRegression(random_state=1))
]

# Length of the per-"epoch" accuracy lists drawn by the plotting cell
N_EPOCHS = 100

best_model = None
best_accuracy = 0.0
val_accuracy_by_model = {}  # model name -> validation accuracy

for model_name, model in models:
    print(f"Training {model_name}...")

    # fit() retrains from scratch and every estimator has a fixed integer
    # random_state, so refitting on the same data in a loop reproduces the
    # same model each time. Fit once and reuse the scores.
    model.fit(X_train, y_train)

    train_predictions = model.predict(X_train)
    val_predictions = model.predict(X_val)
    test_predictions = model.predict(X_test)

    train_accuracy = accuracy_score(y_train, train_predictions)
    val_accuracy = accuracy_score(y_val, val_predictions)
    test_accuracy = accuracy_score(y_test, test_predictions)

    # Same (constant) curves the repeated fitting produced; they hold the
    # values of the model trained last once the loop has finished.
    train_accuracy_list = [train_accuracy] * N_EPOCHS
    val_accuracy_list = [val_accuracy] * N_EPOCHS
    test_accuracy_list = [test_accuracy] * N_EPOCHS

    val_accuracy_by_model[model_name] = val_accuracy

    # Print train, validation and test accuracy
    print(f"{model_name} Train Accuracy: {train_accuracy:.2%}")
    print(f"{model_name} Validation Accuracy: {val_accuracy:.2%}")
    print(f"{model_name} Test Accuracy: {test_accuracy:.2%}")
    print("\n")

    if val_accuracy > best_accuracy:
        best_accuracy = val_accuracy
        best_model = model_name

# Check if all models have similar accuracy. Compare across models: the
# original inspected val_accuracy_list, i.e. only the last model's own values.
similar_models = all(
    np.isclose(accuracy, best_accuracy) for accuracy in val_accuracy_by_model.values()
)
if similar_models:
    print("All models have similar performance.")
else:
    print(f"The best model is {best_model} with a validation accuracy of {best_accuracy:.2%}")

In [None]:
# One figure with two side-by-side panels: train accuracy against
# validation accuracy (left) and against test accuracy (right)
plt.figure(figsize=(15, 5))

comparison_panels = (
    (1, val_accuracy_list, 'Validation Accuracy', 'Train Accuracy vs Validation Accuracy'),
    (2, test_accuracy_list, 'Test Accuracy', 'Train Accuracy vs Test Accuracy'),
)

for panel_index, compared_curve, compared_label, panel_title in comparison_panels:
    plt.subplot(1, 2, panel_index)
    plt.plot(train_accuracy_list, label='Train Accuracy', c='red')
    plt.plot(compared_curve, label=compared_label)
    plt.xlabel('Epochs')
    plt.ylabel('Accuracy')
    plt.title(panel_title)
    plt.legend()

plt.show()

In [None]:
# For every candidate model: refit on the training data, predict on all three
# splits, and show the test-set confusion matrix and classification report.
for model_name, model in models:
    print(f"Evaluating {model_name}...")

    # Train the model
    model.fit(X_train, y_train)

    # Make predictions
    train_predictions = model.predict(X_train)
    val_predictions = model.predict(X_val)  # Assuming you have a validation set X_val and y_val
    test_predictions = model.predict(X_test)

    # Calculate confusion matrix and classification report for train data
    # (train_cm / train_cr are computed but not displayed in this cell)
    train_cm = confusion_matrix(y_train, train_predictions)
    train_cr = classification_report(y_train, train_predictions, output_dict=True, zero_division=1)

    # Calculate confusion matrix and classification report for validation data
    # (val_cm / val_cr are computed but not displayed in this cell)
    val_cm = confusion_matrix(y_val, val_predictions)
    val_cr = classification_report(y_val, val_predictions, output_dict=True, zero_division=1)

    # Calculate confusion matrix and classification report for test data
    # (test_cr is computed but not displayed; the report printed below is a separate call)
    test_cm = confusion_matrix(y_test, test_predictions)
    test_cr = classification_report(y_test, test_predictions, output_dict=True, zero_division=1)

    # Display confusion matrix as a heatmap for test data
    plt.figure(figsize=(8, 6))
    sns.heatmap(test_cm, annot=True, fmt="d", cmap="Blues")
    plt.xlabel('Predicted')
    plt.ylabel('True')
    plt.title(f'Confusion Matrix - {model_name} (Test Data)')
    plt.show()

    # Print classification report for test data
    # NOTE(review): this call omits zero_division=1, unlike the reports computed above
    print(f"Classification Report for {model_name} - Test Data:")
    print(classification_report(y_test, test_predictions))