# Data connection

## Import Libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder
from scipy import stats
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import RandomOverSampler
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
import lazypredict
from lazypredict.Supervised import LazyClassifier
from lightgbm import LGBMClassifier
from sklearn.metrics import f1_score
from sklearn.ensemble import RandomForestClassifier

## Loading Training & Testing Data

In [None]:
# Read the training and test splits from the working directory.
train_df, test_df = (pd.read_csv(csv_name) for csv_name in ("train.csv", "test.csv"))

# Data Exploration

In [None]:
# (rows, columns) of the training split, then of the test split.
print(train_df.shape, test_df.shape, sep="\n")

In [None]:
# Preview the first rows of train_df (head() shows five rows by default).
train_df.head()

In [None]:
# Show the (rows, columns) tuple of train_df as the cell output.
train_df.shape

In [None]:
# Show the column index of train_df, followed by the number of columns.
print(train_df.columns, len(train_df.columns), sep="\n")

In [None]:
# Summarise train_df: dtype and non-null count per column, plus memory usage.
train_df.info()

In [None]:
# Descriptive statistics (count, mean, std, min, quartiles, max) of train_df's numeric columns.
train_df.describe()

In [None]:
# Inspect the distinct classes of the target column 'outcome'.
print(f"Unique values are: {train_df['outcome'].unique()}")
print(f"Total unique values: {train_df['outcome'].nunique()}")

In [None]:
# Split the columns by dtype. The target 'outcome' is excluded from the
# categorical list; the numerical list is taken from the whole frame.
cat_col = list(
    train_df.drop(columns='outcome').select_dtypes(include=['object', 'bool']).columns
)
num_col = list(train_df.select_dtypes(include='number').columns)

print(f'Categorical Columns: {cat_col}\n\n')
print(f'Numerical Columns: {num_col}')

# # create separate numerical and categorical dataframe
# cat_col_df = train_df[cat_col]
# num_col_df = train_df[num_col]

In [None]:
# # Plot the distribution of the "outcome" in train_df
# plt.figure(figsize=(10, 5))
# plt.subplot(1, 2, 1)
# train_df['outcome'].value_counts().plot(kind='bar')
# plt.title('Distribution of Outcome in Train Data')

# plt.tight_layout()
# plt.show()

In [None]:

# # Separate the feature columns and the "outcome" column
# X_train = train_df.copy().drop(columns=["outcome"])
# y_train = train_df["outcome"]

# # Under-sampling the "outcome" column
# under_sampler = RandomUnderSampler(sampling_strategy='auto', random_state=42)
# X_train_under, y_train_under = under_sampler.fit_resample(X_train, y_train)

# # # Over-sampling the "outcome" column
# # over_sampler = RandomOverSampler(sampling_strategy='auto', random_state=42)
# # X_train_over, y_train_over = over_sampler.fit_resample(X_train, y_train)

# # Create DataFrames with balanced "outcome" columns
# under_sampled_df = pd.concat([X_train_under, y_train_under], axis=1)

# # over_sampled_df = pd.concat([X_train_over, y_train_over], axis=1)

In [None]:
# # Plot the distribution of the "outcome" in under_sampled_df
# plt.figure(figsize=(10, 5))
# plt.subplot(1, 2, 1)
# under_sampled_df['outcome'].value_counts().plot(kind='bar')
# plt.title('Distribution of Outcome in under_sampled_df')


# # # Plot the distribution of the "outcome" in over_sampled_df
# # plt.figure(figsize=(10, 5))
# # plt.subplot(1, 2, 1)
# # over_sampled_df['outcome'].value_counts().plot(kind='bar')
# # plt.title('Distribution of Outcome in over_sampled_df')

# plt.tight_layout()
# plt.show()

In [None]:
# train_df = under_sampled_df.copy()

In [None]:
# Feature names: every column of train_df except the target.
columns = list(train_df.columns.drop('outcome'))

# One histogram (with KDE overlay) per feature, coloured by outcome class.
for column in columns:
    fig, ax = plt.subplots(figsize=(7, 3))
    sns.histplot(data=train_df, x=column, hue='outcome', kde=True, ax=ax)
    ax.set_title(f'{column} vs outcome')
    ax.set_xlabel(column)
    ax.set_ylabel('Count')
    plt.show()

In [None]:
# Plot the distribution of every numerical column (names are held in num_col).

# Two plots per row. Keep at least the original 7 rows so the layout is
# unchanged, but grow the grid when num_col holds more than 14 names, where a
# fixed 7x2 grid would make plt.subplot raise ValueError.
n_grid_cols = 2
n_grid_rows = max(7, -(-len(num_col) // n_grid_cols))  # ceiling division

plt.figure(figsize=(20, 35))

for i, column in enumerate(num_col, 1):
    plt.subplot(n_grid_rows, n_grid_cols, i)
    sns.histplot(train_df[column], color="green", kde=True)
    plt.title(f'Distribution of {column}')

plt.tight_layout()
plt.show()

print(num_col)

In [None]:
# Apply a log(1 + x) transformation to the right-skewed columns.
column_skew = ['pulse', 'respiratory_rate', 'total_protein', 'abdomo_protein']

# np.log1p is the dedicated, numerically accurate form of log(1 + x).
# The same transform is applied to both splits so their features stay comparable.
for value in column_skew:
    train_df[value] = np.log1p(train_df[value])
    test_df[value] = np.log1p(test_df[value])

    
# plt.figure(figsize=(20, 35))
# #columns = train_df.columns.tolist()

# for i, column in enumerate(num_col, 1):
#     plt.subplot(7, 2, i)
#     sns.histplot(train_df[column], color="green", kde=True)
#     plt.title(f'Distribution of {column}')
#     plt.ylim()

# plt.tight_layout()
# plt.show()

In [None]:


# Columns that receive a Box-Cox transformation.
columns_to_transform = ['hospital_number', 'lesion_1', 'lesion_2', 'lesion_3']

for column in columns_to_transform:
    # Shift by 1 so that zeros become valid (Box-Cox needs strictly positive data).
    # Called without lmbda, stats.boxcox estimates lambda and returns
    # (transformed_values, lambda).
    train_df[column], fitted_lambda = stats.boxcox(train_df[column] + 1)

    # Re-use the lambda fitted on the training data for the test data. Leaving
    # test_df untransformed would put any column that is later kept as a
    # feature on a different scale in the two frames.
    test_df[column] = stats.boxcox(test_df[column] + 1, lmbda=fitted_lambda)

In [None]:
# Re-plot the distribution of every numerical column listed in num_col.

# Two plots per row. Keep at least the original 7 rows so the layout is
# unchanged, but grow the grid when num_col holds more than 14 names, where a
# fixed 7x2 grid would make plt.subplot raise ValueError.
n_grid_cols = 2
n_grid_rows = max(7, -(-len(num_col) // n_grid_cols))  # ceiling division

plt.figure(figsize=(20, 35))

for i, column in enumerate(num_col, 1):
    plt.subplot(n_grid_rows, n_grid_cols, i)
    sns.histplot(train_df[column], color="green", kde=True)
    plt.title(f'Distribution of {column}')

plt.tight_layout()
plt.show()

In [None]:
# Columns discarded from both splits.
columns_to_remove = ['lesion_2', 'lesion_3', 'hospital_number']

train_df = train_df.drop(columns=columns_to_remove)
test_df = test_df.drop(columns=columns_to_remove)

# Keep num_col in sync with the columns that remain.
num_col = [col for col in num_col if col not in columns_to_remove]
print(num_col)

# Dealing with Missing Data

In [None]:
# Print the number of missing values per column, then show train_df itself as the cell output.
print(train_df.isna().sum())
train_df

In [None]:
# Share of missing entries in each column of train_df, in percent.
missing_percentage = train_df.isnull().sum().div(len(train_df)).mul(100)

missing_df = missing_percentage.to_frame(name='Missing Percentage')
missing_df

In [None]:
def fill_missing_values(df, reference_df=None):
    """Impute missing values in ``df`` in place.

    Categorical columns (the module-level ``cat_col`` list) are filled with the
    most frequent value and numerical columns (the module-level ``num_col``
    list) with the median.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to fill; its ``cat_col`` and ``num_col`` columns are overwritten.
    reference_df : pandas.DataFrame, optional
        Frame the fill values are learned from. Defaults to ``df`` itself,
        which reproduces the original single-argument behaviour.

    Returns
    -------
    None
    """
    if reference_df is None:
        reference_df = df

    # Learn the fill values from the reference frame ...
    categorical_imputer = SimpleImputer(strategy="most_frequent").fit(reference_df[cat_col])
    numerical_imputer = SimpleImputer(strategy="median").fit(reference_df[num_col])

    # ... and apply them to the frame being filled.
    df[cat_col] = categorical_imputer.transform(df[cat_col])
    df[num_col] = numerical_imputer.transform(df[num_col])


# Fill test_df first, with statistics learned from the (still unfilled)
# training data, so both splits are imputed with the same values. Fitting a
# separate imputer on test_df would fill the two splits inconsistently.
fill_missing_values(test_df, reference_df=train_df)
fill_missing_values(train_df)

# check again the missing values
print("Missing numbers left in train_df is:", train_df.isnull().sum().sum())
print("Missing numbers left in test_df is:", test_df.isnull().sum().sum())

In [None]:
# Number of fully duplicated rows in each split.
duplicate_reports = (
    ("Duplicate values in training data is: ", train_df),
    ("Duplicate values in testing data is: ", test_df),
)
for message, frame in duplicate_reports:
    print(message, frame.duplicated().sum())

In [None]:
# Number of distinct values in each text (object-dtype) column, per split.
# Each heading names the split that is actually printed beneath it.
print("Cardinality of categorical features in training data is: ")
print(train_df.select_dtypes(include=["object"]).nunique())
print("\n", "-"*50)
print("\nCardinality of categorical features in testing data is: ")
print(test_df.select_dtypes(include=["object"]).nunique())

In [None]:
# Plain-text preview of the first rows of train_df.
print(train_df.head())

In [None]:
# # change data types to boolean
# bool_col = ['surgery', 'surgical_lesion', 'cp_data']
# train_df[bool_col] = train_df[bool_col].astype(bool)

# Split the categorical columns into ordinal ones (listed by hand) and
# nominal ones (every remaining entry of cat_col).
ordinal_cat_col = [
    'age',
    'temp_of_extremities',
    'peripheral_pulse',
    'capillary_refill_time',
    'rectal_exam_feces',
    'abdomen',
]
nominal_cat_col = [name for name in cat_col if name not in ordinal_cat_col]

print("Nominal Categorical Columns: ", nominal_cat_col, "\n\n")
print("Ordinal Categorical Columns: ", ordinal_cat_col, "\n\n")
# print("Boolean Columns: ", bool_col)

# Encoding 

In [None]:
# Integer-encode the ordinal categorical columns.
# The mapping is learned from train_df only and then re-used for test_df, so a
# given category receives the same code in both frames. Fitting a second
# encoder on test_df would renumber the categories whenever the two frames do
# not contain exactly the same set of values.
# NOTE(review): LabelEncoder numbers categories in sorted order, which is not
# necessarily their clinical order - confirm this is acceptable.
for ordinal_column in ordinal_cat_col:
    enc = LabelEncoder()
    train_df[ordinal_column] = enc.fit_transform(train_df[ordinal_column])

    category_codes = {category: code for code, category in enumerate(enc.classes_)}
    # Categories that never occur in train_df have no learned code; mark them -1.
    test_df[ordinal_column] = (
        test_df[ordinal_column].map(category_codes).fillna(-1).astype(int)
    )

In [None]:
# One-hot encode the nominal columns of each split separately, dropping the
# first level of every encoded column.
train_df, test_df = (
    pd.get_dummies(frame, columns=nominal_cat_col, drop_first=True)
    for frame in (train_df, test_df)
)

In [None]:
# Encode the target classes as integers.
# The result is assigned back to the column: calling replace(..., inplace=True)
# on train_df['outcome'] acts on an intermediate Series, which pandas warns
# about and which leaves train_df unchanged once Copy-on-Write is active.
outcome_codes = {'died': 0, 'euthanized': 1, 'lived': 2}
# astype(int) makes the integer dtype explicit and fails loudly on an unmapped label.
train_df['outcome'] = train_df['outcome'].replace(outcome_codes).astype(int)

* After encoding, the 'pain_moderate' column is missing in train_df, and the columns ['nasogastric_reflux_slight', 'pain_slight', 'peristalsis_distend_small', 'rectal_exam_feces_4'] are missing in test_df.

In [None]:
# Align the one-hot encoded columns of the two frames.
# get_dummies ran on each frame separately, so a dummy column exists only in
# the frame where that category occurs. Keep the columns both frames share,
# computed from the data instead of a hand-maintained list of names.
train_feature_columns = train_df.columns.drop('outcome')

# Dummy columns present only in test_df (e.g. 'pain_moderate').
test_only_columns = test_df.columns.difference(train_feature_columns)
test_df = test_df.drop(columns=test_only_columns)

# Dummy columns present only in train_df
# (e.g. 'nasogastric_reflux_slight', 'pain_slight', 'peristalsis_distend_small').
train_only_columns = train_feature_columns.difference(test_df.columns)
train_df = train_df.drop(columns=train_only_columns)

# Give test_df the same column order as the training features.
test_df = test_df[train_df.columns.drop('outcome')]

# Split Dataset

In [None]:
# Target vector and feature matrix (identifier and target columns removed).
y = train_df["outcome"]
X = train_df.drop(columns=["id", "outcome"])

# Balance the classes by randomly under-sampling; X and y are replaced by the
# resampled versions.
under_sampler = RandomUnderSampler(random_state=42, sampling_strategy='auto')
X, y = under_sampler.fit_resample(X, y)

In [None]:
# define input features and label 
# X = train_df.copy().drop(columns=['id', 'outcome'])
# y = train_df['outcome']

# Features of the test set (identifier column removed).
X_test = test_df.drop(columns='id')

# Stratified 70/30 split of X / y into a training part and a hold-out part.
X_train, X_temp, y_train, y_temp = train_test_split(
    X,
    y,
    test_size=0.3,
    stratify=y,
    random_state=42,
)


#  Feature Scaling

In [None]:
# Display the training features as the cell output.
X_train

In [None]:
# Display the test-set features as the cell output.
X_test

In [None]:
# y_train

In [None]:
# Learn the scaling statistics from the training part only, then apply them
# to both the training part and the hold-out part.
sc = StandardScaler().fit(X_train)

X_train = sc.transform(X_train)
X_temp = sc.transform(X_temp)

# Model Selection

In [None]:
# Display lazypredict's CLASSIFIERS registry
# (presumably the models LazyClassifier evaluates - confirm in the lazypredict docs).
lazypredict.Supervised.CLASSIFIERS

In [None]:
# Benchmark lazypredict's classifiers: fit on the training part, evaluate on
# the hold-out part, and show the resulting table as the cell output.
clf = LazyClassifier(
    verbose=1,
    ignore_warnings=True,
    custom_metric=None,
)
models, predictions = clf.fit(X_train, X_temp, y_train, y_temp)
models

In [None]:
# LightGBM classifier with default parameters, trained on the training part.
lgbm_model = LGBMClassifier()
lgbm_model.fit(X_train, y_train)

# Score the hold-out part with the micro-averaged F1.
y_pred = lgbm_model.predict(X_temp)
micro_f1 = f1_score(y_true=y_temp, y_pred=y_pred, average='micro')

print("Micro-average F1 Score:", micro_f1)

In [None]:
# Initialize the RandomForest model.
# A fixed random_state makes the forest's random sampling repeatable, so the
# score printed below is the same on every run and can be compared fairly
# with the other models.
rf = RandomForestClassifier(random_state=42)

# Fit (train) the model on the training part
rf.fit(X_train, y_train)

# Make predictions on the hold-out part
y_pred = rf.predict(X_temp)

# Calculate the micro-average F1 score
micro_f1 = f1_score(y_temp, y_pred, average='micro')

print("Micro-average F1 Score:", micro_f1)

In [None]:
# The final fit uses every row of X / y: rebind the training names to them and
# scale with a fresh scaler that is also applied to the test-set features.
y_train = y
sc2 = StandardScaler()

X_train = sc2.fit_transform(X)
X_test = sc2.transform(X_test)

In [None]:
# Refit the random forest (the model the author found to score better) on the
# full training data, then predict the test set. fit() returns the estimator,
# so the two steps can be chained.
y_pred = rf.fit(X_train, y_train).predict(X_test)

In [None]:
# Display the predicted class codes for the test set.
y_pred

In [None]:
# Integer class code -> outcome label.
outcome_mapping = {0: 'died', 1: 'euthanized', 2: 'lived'}

# Translate every predicted code back to its label (an unknown code raises KeyError).
outcome_labels = list(map(outcome_mapping.__getitem__, y_pred))

# Submission table: integer id of each test row next to its predicted label.
submission = pd.DataFrame({
    'id': test_df['id'].astype(int),
    'outcome': outcome_labels,
})

print(submission)

In [None]:
# Write the submission to 'submission.csv' without the DataFrame index column.
submission.to_csv('submission.csv', index=False)

In [None]:
# How often each outcome label was predicted.
outcome_counts = submission['outcome'].value_counts()

# Bar chart of the predicted label counts, with horizontal tick labels.
ax = outcome_counts.plot(kind='bar', rot=0)
ax.set_xlabel('Outcome')
ax.set_ylabel('Count')
ax.set_title('Distribution of Outcomes')
plt.show()