In [1]:
# Importing necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
from sklearn.ensemble import RandomForestClassifier
#from sklearn.multioutput import MultiOutputClassifier
#from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
# Loading the dataset
data = pd.read_csv("C:/Users/Dell/Desktop/Applied ML/project_eda.csv")
data.info

<bound method DataFrame.info of                                                Indicator              Group  \
0      Took Prescription Medication for Mental Health...  National Estimate   
1      Took Prescription Medication for Mental Health...             By Age   
2      Took Prescription Medication for Mental Health...             By Age   
3      Took Prescription Medication for Mental Health...             By Age   
4      Took Prescription Medication for Mental Health...             By Age   
...                                                  ...                ...   
10399  Needed Counseling or Therapy But Did Not Get I...           By State   
10400  Needed Counseling or Therapy But Did Not Get I...           By State   
10401  Needed Counseling or Therapy But Did Not Get I...           By State   
10402  Needed Counseling or Therapy But Did Not Get I...           By State   
10403  Needed Counseling or Therapy But Did Not Get I...           By State   

               Stat

In [3]:
data.head()

Unnamed: 0,Indicator,Group,State,Subgroup,Phase,Time Period,Time Period Label,Value,LowCI,HighCI,Quartile Range,Usage_Category
0,Took Prescription Medication for Mental Health...,National Estimate,United States,United States,2,13,"Aug 19 - Aug 31, 2020",19.4,19.0,19.8,9.8-11.2,Medium
1,Took Prescription Medication for Mental Health...,By Age,United States,18 - 29 years,2,13,"Aug 19 - Aug 31, 2020",18.7,17.2,20.3,9.8-11.2,Medium
2,Took Prescription Medication for Mental Health...,By Age,United States,30 - 39 years,2,13,"Aug 19 - Aug 31, 2020",18.3,17.3,19.2,9.8-11.2,Medium
3,Took Prescription Medication for Mental Health...,By Age,United States,40 - 49 years,2,13,"Aug 19 - Aug 31, 2020",20.4,19.5,21.3,9.8-11.2,Medium
4,Took Prescription Medication for Mental Health...,By Age,United States,50 - 59 years,2,13,"Aug 19 - Aug 31, 2020",21.2,20.2,22.2,9.8-11.2,Medium


In [4]:
# Identify categorical columns
cat_cols = data.select_dtypes(include=['object', 'category']).columns.to_list()
print(cat_cols)

['Indicator', 'Group', 'State', 'Subgroup', 'Phase', 'Time Period Label', 'Quartile Range', 'Usage_Category']


In [5]:
label_encoder = LabelEncoder()
# Apply label encoding to each categorical column
for col in cat_cols:
    data[col] = label_encoder.fit_transform(data[col])

# Verify the changes
print(data.head())

   Indicator  Group  State  Subgroup  Phase  Time Period  Time Period Label  \
0          3      9     44        70      1           13                  4   
1          3      0     44         0      1           13                  4   
2          3      0     44         1      1           13                  4   
3          3      0     44         2      1           13                  4   
4          3      0     44         3      1           13                  4   

   Value  LowCI  HighCI  Quartile Range  Usage_Category  
0   19.4   19.0    19.8             490               2  
1   18.7   17.2    20.3             490               2  
2   18.3   17.3    19.2             490               2  
3   20.4   19.5    21.3             490               2  
4   21.2   20.2    22.2             490               2  


In [6]:
# Defining the features and target variables
y = data["Usage_Category"]
x = data.drop(columns=["Value", "Usage_Category"])

In [7]:
# Split the data into training and test sets
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

In [8]:
# Normalizing the dataset
# Initialize the StandardScaler
scaler = MinMaxScaler()

# Fit the scaler on the training data and transform both training and test data
x_train_scaled = scaler.fit_transform(x_train)
x_test_scaled = scaler.transform(x_test)

In [14]:
# Applying linear regression to check the linearity of the dataset

# Initialize and train the linear regression model
linear_model = LinearRegression()
linear_model.fit(x_train, y_train)

# Predict on the test set
y_pred = linear_model.predict(x_test)

# Calculate R² score
r2 = r2_score(y_test, y_pred)
print("R² score:", r2)

R² score: 0.37478054194370425


In [10]:
# Applying RF model
rf = RandomForestClassifier(random_state=42)
rf.fit(x_train, y_train)

In [11]:
# Make predictions on the test data
y_pred = rf.predict(x_test)

# Evaluate the model's performance
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy * 100:.2f}%")

# Classification report (precision, recall, F1-score, etc.)
print("Classification Report:\n", classification_report(y_test, y_pred))

# Confusion matrix
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))

Accuracy: 99.09%
Classification Report:
               precision    recall  f1-score   support

           0       0.99      0.99      0.99       705
           1       0.99      0.99      0.99       656
           2       0.99      0.99      0.99       720

    accuracy                           0.99      2081
   macro avg       0.99      0.99      0.99      2081
weighted avg       0.99      0.99      0.99      2081

Confusion Matrix:
 [[700   0   5]
 [  0 652   4]
 [  5   5 710]]


In [19]:
from sklearn.svm import SVC
from sklearn.model_selection import cross_val_score

# Linear SVM
model_linear = SVC(kernel='linear')
score_linear = cross_val_score(model_linear, x_train_scaled, y_train, cv=5).mean()

# Polynomial SVM (degree 2)
model_poly2 = SVC(kernel='poly', degree=2)
score_poly2 = cross_val_score(model_poly2, x_train_scaled, y_train, cv=5).mean()

# RBF SVM
model_rbf = SVC(kernel='rbf', gamma='scale')
score_rbf = cross_val_score(model_rbf, x_train_scaled, y_train, cv=5).mean()

# Compare results
print("Linear Kernel CV Score:", score_linear)
print("Polynomial Kernel Degree 2 CV Score:", score_poly2)
print("RBF Kernel Gamma as Scale CV Score:", score_rbf)

Linear Kernel CV Score: 0.9699623902748904
Polynomial Kernel Degree 2 CV Score: 0.980295174982675
RBF Kernel Gamma as Scale CV Score: 0.9561446061446063


In [22]:
# For choosing the best param for our model
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline

pipeline = Pipeline([
    ('svc', SVC(kernel='poly'))
])

# Define the parameter grid
param_grid = {
    'svc__degree': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
    'svc__C': [0.1, 1, 10, 100],    
    'svc__gamma': ['scale', 'auto']
}

# Set up GridSearchCV
grid_search = GridSearchCV(pipeline, param_grid, cv=5, scoring='accuracy')

# Fit the model
grid_search.fit(x_train_scaled, y_train)

# Get the best parameters and score
best_degree = grid_search.best_params_['svc__degree']
best_score = grid_search.best_score_

# Get the best parameters and score
best_params = grid_search.best_params_
best_score = grid_search.best_score_

print(f"Best parameters: {best_params}")
print(f"Best cross-validated score: {best_score}")

Best parameters: {'svc__C': 100, 'svc__degree': 1, 'svc__gamma': 'scale'}
Best cross-validated score: 0.9942325017325018


In [None]:
# Make predictions
y_train_pred = svm_model.predict(x_train)
y_test_pred = svm_model.predict(x_test)

# Evaluate the model's performance
accuracy = accuracy_score(y_test, y_test_pred)
print(f"Accuracy: {accuracy * 100:.2f}%")

# Classification report (precision, recall, F1-score, etc.)
print("Classification Report:\n", classification_report(y_test, y_test_pred))

# Confusion matrix
print("Confusion Matrix:\n", confusion_matrix(y_test, y_test_pred))

In [None]:
from sklearn.metrics import mean_squared_error
# Calculate MSE for both training and test sets
svm_train_mse = mean_squared_error(y_train, y_train_pred)
svm_test_mse = mean_squared_error(y_test, y_test_pred)

# Print the MSE values
print(f"Training MSE: {svm_train_mse}")
print(f"Test MSE: {svm_test_mse}")

# Plot the MSE for both training and test sets
mse_values = [svm_train_mse, svm_test_mse]
labels = ['Training MSE', 'Test MSE']

plt.bar(labels, mse_values, color=['blue', 'red'])
plt.ylabel('Mean Squared Error')
plt.title('Training vs Test MSE')
plt.show()

In [None]:
# Applying Logistic regression
from sklearn.linear_model import LogisticRegression

logistic_model = LogisticRegression(random_state=42)
logistic_model.fit(x_train, y_train)

In [None]:
# Make predictions
y_train_pred = logistic_model.predict(x_train)
y_test_pred = logistic_model.predict(x_test)

# Evaluate the model's performance
accuracy = accuracy_score(y_test, y_test_pred)
print(f"Accuracy: {accuracy * 100:.2f}%")

# Classification report (precision, recall, F1-score, etc.)
print("Classification Report:\n", classification_report(y_test, y_test_pred))

# Confusion matrix
print("Confusion Matrix:\n", confusion_matrix(y_test, y_test_pred))

In [None]:
# Plot confusion matrix as heatmap
cm = confusion_matrix(y_test, y_test_pred)
plt.figure(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', 
            xticklabels=['Low', 'Medium', 'High'],
            yticklabels=['Low', 'Medium', 'High'])
plt.xlabel('Predicted labels')
plt.ylabel('True labels')
plt.title('Confusion Matrix')

In [None]:
from sklearn.metrics import mean_squared_error
# Calculate MSE for both training and test sets
logreg_train_mse = mean_squared_error(y_train, y_train_pred)
logreg_test_mse = mean_squared_error(y_test, y_test_pred)

# Print the MSE values
print(f"Training MSE: {logreg_train_mse}")
print(f"Test MSE: {logreg_test_mse}")

# Plot the MSE for both training and test sets
mse_values = [logreg_train_mse, logreg_test_mse]
labels = ['Training MSE', 'Test MSE']

plt.bar(labels, mse_values, color=['blue', 'red'])
plt.ylabel('Mean Squared Error')
plt.title('Training vs Test MSE')
plt.show()

In [None]:
# Applying ANN
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

# Build the ANN model
model = keras.Sequential()
model.add(layers.Dense(10, activation='relu', input_shape=(x_train.shape[1],)))  # Input layer
model.add(layers.Dense(10, activation='relu'))  # Hidden layer
model.add(layers.Dense(len(label_encoder.classes_), activation='softmax'))  # Output layer

# Compile the model
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

In [None]:
# Train the model
model.fit(x_train, y_train, epochs=100, batch_size=5, verbose=1)