Assignment week 4

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [None]:
# Load the dataset from cleaned_data.csv (path relative to the working directory).
df = pd.read_csv('cleaned_data.csv')

In [None]:
# Preview the first rows of the loaded data to check columns and values.
df.head()

In [None]:
!pip install pycaret

In [None]:
# Bring the PyCaret classification API into the notebook namespace; later
# cells call setup, compare_models, blend_models, predict_model, plot_model,
# etc. without a module prefix, so they rely on this wildcard import.
from pycaret.classification import *
# Initialise an experiment on df with 'obesity_level' as the target, using
# default settings.
# NOTE(review): the next cell calls setup again with explicit options and
# rebinds `s`, so this call looks redundant — confirm before removing.
s=setup(data=df,target='obesity_level')

In [None]:
# Initialise the experiment with explicit preprocessing choices. The options
# are collected in one mapping so they can be read (and tweaked) at a glance;
# session_id fixes the random seed used by the experiment.
experiment_options = {
    'numeric_features': ['Age', 'Weight', 'Height'],
    'session_id': 123,
    'normalize': True,
    'feature_selection': True,
    'polynomial_features': True,
    'experiment_name': 'obesity_prediction',
    'remove_multicollinearity': True,
    'multicollinearity_threshold': 0.95,
}
s = setup(data=df, target='obesity_level', **experiment_options)

In [None]:
# Compare the available estimators and keep only the top-ranked one (n_select=1).
best=compare_models(n_select=1)

In [None]:
# Compare a shortlist of tree-ensemble classifiers and keep the three best,
# ranked by precision.
top_models = compare_models(
    include=['gbc', 'lightgbm', 'rf', 'xgboost', 'et'],   # Include specific models
    fold=5,                            # Number of folds for cross-validation
    n_select=3,                        # Number of top models to select
    verbose=True,                      # Print detailed output
    # PyCaret measures budget_time in MINUTES, not seconds. The previous value
    # of 600 therefore meant a 10-hour budget; 10 gives the intended 600 s.
    budget_time=10,                    # Maximum time for model selection in minutes
    sort='precision'                   # Metric used to rank the score grid
)

In [None]:
# Display the list of selected models returned by compare_models.
top_models

In [None]:
# Combine the selected top models into a single blended estimator,
# evaluated with 5-fold cross-validation.
blender = blend_models(estimator_list=top_models, fold=5)

# Finalize the blended model (not the single `best` model from earlier).
# NOTE(review): finalize_model is documented to refit on the full dataset,
# hold-out included — confirm, since later hold-out evaluations of
# final_model would then be optimistic.
final_model = finalize_model(blender)

# Print the finalized blended model
print(final_model)

In [None]:
# Display the finalized model object as the cell output.
final_model

In [None]:
# Retrieve the train and hold-out splits stored by the PyCaret experiment.
# NOTE(review): 'X_train'/'X_test' presumably hold features only (no target
# column), despite the *_df names — confirm.
train_df = get_config('X_train')
test_df = get_config('X_test')

In [None]:
# Preview the first rows of the hold-out split.
test_df.head()

In [None]:
# Score the single best model from the first comparison.
# NOTE(review): with no `data` argument, predict_model presumably evaluates on
# the experiment's hold-out split — confirm.
predict_model(best)

In [None]:
# Re-display the first rows of the original DataFrame for reference.
df.head()

In [None]:
# Predict labels for the hold-out features with the finalized blended model
# and preview the result.
# NOTE(review): final_model comes from finalize_model, which is documented to
# refit on the full dataset; if so, these rows were seen during training —
# confirm before treating the output as out-of-sample predictions.
predictions=predict_model(final_model, data=test_df)
predictions.head()

In [None]:
# Save the predictions DataFrame to a new CSV file
output_csv = 'Infosys_week_3_4.csv'
predictions.to_csv(output_csv)

# Offer the file as a browser download when running in Google Colab. The
# google.colab package only exists there, so skip the download step elsewhere
# instead of failing the whole cell after the file has been written.
try:
    from google.colab import files
except ImportError:
    print(f'Not running in Colab; predictions saved to {output_csv}')
else:
    files.download(output_csv)

In [None]:
# Show the evaluation metrics available in the current experiment.
get_metrics()

In [None]:
# ROC curve (plot_model's default plot) for the first model in top_models.
# NOTE(review): the original note identified this model as GBC; the order of
# top_models depends on the compare_models ranking — verify against its output.
plot_model(top_models[0])

In [None]:
# Confusion matrix for the second model in top_models.
# NOTE(review): the original note identified this model as the Random Forest
# Classifier; the order of top_models depends on the compare_models ranking —
# verify against its output.
plot_model(top_models[1], plot='confusion_matrix')

In [None]:
# Feature-importance plot for the third model in top_models.
# NOTE(review): the original note identified this model as Light GBM; the
# order of top_models depends on the compare_models ranking — verify.

plot_model(top_models[2], plot='feature')

In [None]:
# Prediction-error plot for the first (top-ranked) model in top_models.
# NOTE(review): the original note identified this model as the Gradient
# Boosting Classifier; confirm against the compare_models output.
plot_model(top_models[0], plot='error')

In [None]:
# Classification-report plot for the first model in top_models.
plot_model(top_models[0], plot='class_report')

In [None]:
# Precision-recall curve for the blended model. plot_model evaluates on the
# hold-out split, and finalize_model refits on the full dataset including that
# split, so plotting final_model would score data it was trained on. Use the
# pre-finalization blender to get an honest hold-out curve.
plot_model(blender, plot='pr')

## Loading Test Data

In [None]:
# Feature-only frame for scoring: every column of df except the last one.
# NOTE(review): assumes the target ('obesity_level') is the last column — confirm.
# .copy() makes this an independent DataFrame, so later clean-up steps work on
# its own data rather than on a slice of df (avoids SettingWithCopyWarning).
test_modified = df.iloc[:, :-1].copy()

In [None]:
# Preview the first rows of the feature-only frame.
test_modified.head()

In [None]:
# Show the (rows, columns) dimensions of the feature-only frame.
test_modified.shape

In [None]:
# Show every row that has at least one missing value or at least one cell
# consisting of a single space character.
test_modified.loc[
    lambda frame: frame.isna().any(axis=1) | frame.eq(' ').any(axis=1)
]

In [None]:
# Remove rows that contain missing values. Rebinding to the returned frame
# (instead of inplace=True) avoids mutating a frame that was sliced out of df,
# which is what raises SettingWithCopyWarning.
test_modified = test_modified.dropna()

In [None]:
# Re-run the missing/blank check after clean-up: lists any row that still has
# a missing value or a cell consisting of a single space character.
test_modified.loc[
    lambda frame: frame.isna().any(axis=1) | frame.eq(' ').any(axis=1)
]

In [None]:
# Report how many duplicate rows exist before removing them. A bare expression
# is only displayed when it is the last statement of a cell, so the count has
# to be printed explicitly here.
print('Duplicate rows:', test_modified.duplicated().sum())

# Keep the first occurrence of each duplicated row.
test_modified = test_modified.drop_duplicates()

## Prediction for Test Data

In [None]:
# Score the cleaned, de-duplicated feature frame with the finalized model.
# NOTE(review): test_modified is derived from df (the data passed to setup),
# so this is not an independent test set — confirm intent.
test_predictions=predict_model(final_model, data=test_modified)

In [None]:
# Display the prediction results as the cell output.
test_predictions

In [None]:
# Horizontal bar chart: number of rows per predicted label (prediction_label).

from matplotlib import pyplot as plt
import seaborn as sns

label_counts = test_predictions.groupby('prediction_label').size()
ax = label_counts.plot(kind='barh', color=sns.palettes.mpl_palette('Dark2'))
# Hide the top and right borders for a cleaner look.
ax.spines[['top', 'right']].set_visible(False)

In [None]:
# Persist the finalized model to disk under the name 'final_model'.
# NOTE(review): PyCaret presumably appends a '.pkl' extension to this name — confirm.
from pycaret.classification import save_model

save_model(final_model, 'final_model')


In [None]:
import matplotlib.pyplot as plt

# One entry per chart, drawn in this order:
# (seaborn plotting function, column mapping, title, x-axis label, y-axis label)
chart_specs = [
    # Distribution of predicted obesity levels
    (sns.countplot,
     {'x': 'prediction_label'},
     'Distribution of Predicted Obesity Levels', 'Obesity Level', 'Count'),
    # Bar plot of mean age by predicted obesity level
    (sns.barplot,
     {'x': 'prediction_label', 'y': 'Age'},
     'Mean Age by Predicted Obesity Level', 'Obesity Level', 'Age'),
    # Boxplot of weight distribution by predicted obesity level
    (sns.boxplot,
     {'x': 'prediction_label', 'y': 'Weight'},
     'Weight Distribution by Predicted Obesity Level', 'Obesity Level', 'Weight'),
    # Scatter plot of height vs. weight, colored by predicted obesity level
    (sns.scatterplot,
     {'x': 'Height', 'y': 'Weight', 'hue': 'prediction_label'},
     'Height vs. Weight by Predicted Obesity Level', 'Height', 'Weight'),
]

# Every chart gets its own 8x6 figure, is labelled, and is shown immediately.
for draw_chart, column_mapping, chart_title, x_label, y_label in chart_specs:
    plt.figure(figsize=(8, 6))
    draw_chart(data=test_predictions, **column_mapping)
    plt.title(chart_title)
    plt.xlabel(x_label)
    plt.ylabel(y_label)
    plt.show()
