In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as pl
import sklearn as sl
import seaborn as sns

In [None]:
# Read the training set from disk and show it as the cell output.
train_csv = 'train_data.csv'
df = pd.read_csv(train_csv)
df

In [None]:
# Count the missing values in each column of the training frame.
df.isna().sum()

In [None]:
# Report how many columns and rows the training frame has.
n_rows, n_cols = df.shape
print("columns = ", n_cols)
print("rows = ", n_rows)
# The data set is fairly small, so the plan is to limit the influence of
# outliers instead of leaving them untouched (handled further down).

In [None]:
# Interpretation of the chi-square test computed in the next cell (significance level 0.05):
# - Sex vs. Treatment_Type: p-value is about 0.8 (> 0.05), so there is no significant association.
# - Health_Index vs. Treatment_Type: p-value << 0.05, so the association is highly significant.
# - Fat_Level vs. Treatment_Type: p-value is around 0.05 (borderline).
#   To reduce the risk of underfitting, we still consider it as a candidate feature.

In [None]:
from sklearn.feature_selection import chi2
from sklearn.preprocessing import LabelEncoder

# Integer-encode the categorical predictors in place (df is overwritten).
# One LabelEncoder is kept per column in `label_encoders` so the fitted
# label -> code mapping of every column stays available afterwards.
# `encoder` ends up bound to the last encoder fitted (Fat_Level).
# Note: re-running this cell refits the encoders on the integer codes.
categorical_cols = ['Sex', 'Health_Index', 'Fat_Level']
label_encoders = {}
for col in categorical_cols:
    encoder = LabelEncoder()
    df[col] = encoder.fit_transform(df[col])
    label_encoders[col] = encoder

X = df[categorical_cols]
y = df['Treatment_Type']

# Correlation heatmap. Only numeric columns are passed to corr(): a non-numeric
# column (e.g. Treatment_Type if it holds labels) would make corr() raise on
# pandas versions that no longer drop such columns silently.
correlation = (
    df[['Age', 'Mineral_Ratio', 'Fat_Level', 'Treatment_Type']]
    .select_dtypes(include='number')
    .corr()
)
sns.heatmap(correlation, annot=True, cmap='coolwarm')
pl.show()

# chi2 returns a (statistics, p-values) pair with one entry per column of X.
chi_scores = chi2(X, y)

# Show the result as a labelled table instead of a raw tuple of arrays.
chi_table = pd.DataFrame(
    {'chi2': chi_scores[0], 'p_value': chi_scores[1]},
    index=X.columns,
)
print(chi_table)
 

In [None]:
# Columns carried forward: Mineral_Ratio and Health_Index as predictors, plus
# the Treatment_Type label.
# NOTE(review): Fat_Level is not selected here although it was flagged as a
# borderline candidate - confirm whether that is intentional.
selected_cols = ['Mineral_Ratio', 'Health_Index', 'Treatment_Type']
refinedData = df[selected_cols]

# Summary statistics; the 25% / 75% rows are read by the outlier step.
IQR = refinedData.describe()
IQR

In [None]:
# Outlier filter on Mineral_Ratio using the 1.5 * IQR fences.
# The quartiles come from the describe() table stored in `IQR`.
q1 = IQR.loc['25%', 'Mineral_Ratio']
q3 = IQR.loc['75%', 'Mineral_Ratio']
IQR_Range = q3 - q1
lower_1 = q1 - (1.5*IQR_Range)
upper_1 = q3 + (1.5*IQR_Range)

# Rows outside [lower_1, upper_1] are dropped (not clipped); bounds are inclusive.
# .copy() makes the result an independent frame, so later column assignments
# on refinedData do not raise SettingWithCopyWarning.
within_fences = refinedData['Mineral_Ratio'].between(lower_1, upper_1)
refinedData = refinedData[within_fences].copy()
refinedData

In [None]:
# Min-max scale the numeric Mineral_Ratio column.
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()
# fit_transform expects a 2-D input, hence the double brackets.
mineral_scaled = scaler.fit_transform(refinedData[['Mineral_Ratio']])
refinedData['Mineral_Ratio'] = mineral_scaled
refinedData


In [None]:
from sklearn.cluster import KMeans

# Features used for clustering.
X = refinedData[['Mineral_Ratio','Health_Index']]

# Fit K-Means for k = 1..10 and record the inertia of each fit
# (sum of squared distances of samples to their closest cluster center).
k_range = range(1, 11)
inertia = []
for k in k_range:
    kmeans = KMeans(n_clusters=k, random_state=42).fit(X)
    inertia.append(kmeans.inertia_)

# Elbow curve: inertia as a function of k.
fig, ax = pl.subplots(figsize=(10, 6))
ax.plot(k_range, inertia, marker='o')
ax.set_title('Elbow Method for Optimal k')
ax.set_xlabel('Number of Clusters (k)')
ax.set_ylabel('Inertia')
ax.set_xticks(k_range)
ax.grid()
pl.show()

In [None]:
# Final K-Means fit on X with the number of clusters read off the elbow plot.
k = 4  # chosen cluster count; adjust if the elbow suggests another value
kmeans = KMeans(n_clusters=k, random_state=42)

# Store each row's cluster assignment in a new 'Cluster' column.
cluster_labels = kmeans.fit_predict(X)
refinedData['Cluster'] = cluster_labels

# Quick look at the first rows with their cluster assignments.
print(refinedData.head())

In [None]:
# 3D scatter of the clustered samples, coloured by cluster.
# x = scaled Mineral_Ratio, y = Health_Index codes, z = encoded Fat_Level.
# Fat_Level is shown for context only; the clusters were fit on X.
from mpl_toolkits.mplot3d import Axes3D  # noqa: F401 - registers the '3d' projection on older Matplotlib

fig = pl.figure(figsize=(10, 8))
ax = fig.add_subplot(111, projection='3d')

# Without explicit z data every point would sit on the z = 0 plane, which
# contradicts the z-axis label. Take Fat_Level from refinedData when present,
# otherwise from df, aligned on the row labels that survived outlier filtering.
if 'Fat_Level' in refinedData.columns:
    fat_level = refinedData['Fat_Level']
else:
    fat_level = df.loc[refinedData.index, 'Fat_Level']

# Scatter plot
points = ax.scatter(
    refinedData['Mineral_Ratio'],
    refinedData['Health_Index'].astype('category').cat.codes,  # category codes for plotting
    fat_level,
    c=refinedData['Cluster'],
    cmap='viridis',
    marker='o',
)

# Set labels
ax.set_xlabel('Mineral Ratio (Normalized)')
ax.set_ylabel('Health Index (Encoded)')
ax.set_zlabel('Fat Level (Encoded)')
ax.set_title('K-Means Clustering in 3D Space')

# Colour bar keyed to the scatter artist returned above.
pl.colorbar(points, label='Cluster')
pl.show()

In [None]:

from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score, fowlkes_mallows_score



In [None]:

# Project the features onto 2 principal components for visualization.
# The label column is dropped, and so is any existing 'Cluster' column:
# feeding earlier cluster assignments into PCA would leak them into the
# data that is clustered and scored afterwards.
pca_features = (
    refinedData
    .drop(columns=['Treatment_Type'])
    .drop(columns=['Cluster'], errors='ignore')
)
pca = PCA(n_components=2)
pca_transformed = pca.fit_transform(pca_features)

# Visualize PCA-transformed data
pl.scatter(pca_transformed[:, 0], pca_transformed[:, 1], s=50)
pl.title('PCA - 2D Projection of Data')
pl.xlabel('Principal Component 1')
pl.ylabel('Principal Component 2')
pl.show()


In [None]:

# Cluster the PCA projection into 4 groups using k-means++ initialization.
kmeans = KMeans(n_clusters=4, init='k-means++', random_state=42)
pca_labels = kmeans.fit_predict(pca_transformed)

# Overwrite the 'Cluster' column with the assignments from this fit.
refinedData['Cluster'] = pca_labels

# Silhouette score of the clustering (closer to 1 is better).
silhouette_avg = silhouette_score(pca_transformed, pca_labels)
print(f'Silhouette Score: {silhouette_avg}')



In [None]:
# Switch to the test set: read it from disk.
test_csv = 'test_data.csv'
df_test = pd.read_csv(test_csv)

In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, MinMaxScaler

In [None]:
# Encode Sex in the test set with the codes the training data received.
# Training used LabelEncoder, which numbers classes in sorted order, so the
# female label sorts first (0) and the male label second (1).
# NOTE(review): assumes training Sex held exactly one female and one male
# label ('F'/'M' or 'Female'/'Male') - confirm against train_data.csv.
# Both spellings are accepted; case and surrounding whitespace are ignored.
# Missing or unrecognised values become NaN.
sex_codes = {'F': 0, 'FEMALE': 0, 'M': 1, 'MALE': 1}
df_test['Sex'] = df_test['Sex'].map(
    lambda value: sex_codes.get(str(value).strip().upper(), np.nan)
)

In [None]:
# --- Health_Index ---
# Categories the encoder is fitted on.
# NOTE(review): hard-coded; assumed to match the labels in the training data - confirm.
known_health_index = ['HIGH', 'MEDIUM', 'LOW']

# Any value outside the known list (including missing values) falls back to 'LOW'.
health_is_known = df_test['Health_Index'].isin(known_health_index)
df_test['Health_Index'] = df_test['Health_Index'].where(health_is_known, 'LOW')

# A fresh encoder is fitted on the known list, then applied to the column.
encoder = LabelEncoder()
encoder.fit(known_health_index)
df_test['Health_Index'] = encoder.transform(df_test['Health_Index'])

# --- Fat_Level ---
# Same procedure with its own category list.
# NOTE(review): hard-coded; assumed to match the labels in the training data - confirm.
known_fat_level = ['HIGH', 'LOW']
fat_is_known = df_test['Fat_Level'].isin(known_fat_level)
df_test['Fat_Level'] = df_test['Fat_Level'].where(fat_is_known, 'LOW')

# The same encoder object is refitted, so it ends up holding the Fat_Level classes.
encoder.fit(known_fat_level)
df_test['Fat_Level'] = encoder.transform(df_test['Fat_Level'])
