In [5]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report

# Load the dataset
file_path = 'Extended_Dataset-Mental-Disorders-5261.csv'  # Update this path if necessary
data = pd.read_csv("Unique_Extended_Dataset-Mental-Disorders-5261.csv")

# Preview the dataset
print("Dataset preview:")
print(data.head())

# Print column names to identify the target column
print("\nColumn names in the dataset:")
print(data.columns)

# Data preprocessing
# Use 'Expert Diagnose' as the target column
target_column = 'Expert Diagnose'
if target_column not in data.columns:
    raise KeyError(f"The specified target column '{target_column}' is not found in the dataset.")

X = data.drop(columns=[target_column])  # Features
y = data[target_column]  # Target

# Handle missing values (if any)
X = X.fillna(X.select_dtypes(include=['number']).mean(numeric_only=True))

# Convert categorical variables to numeric (if necessary)
X = pd.get_dummies(X, drop_first=True)

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create an SVM model
svm_model = SVC(kernel='linear', random_state=42)

# Train the model
svm_model.fit(X_train, y_train)

# Make predictions
y_pred = svm_model.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print("\nAccuracy:", accuracy)
print(f"\nThe model achieved an accuracy of {accuracy * 100:.2f}%")
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

# Feature importance approximation (weights from linear kernel)
if hasattr(svm_model, 'coef_'):
    feature_importances = pd.DataFrame({
        'Feature': X.columns,
        'Importance': abs(svm_model.coef_[0])
    }).sort_values(by='Importance', ascending=False)

    print("\nFeature Importances (approximated from linear SVM):")
    print(feature_importances)
else:
    print("\nFeature importance is not available for non-linear kernels.")


Dataset preview:
  Patient Number    Sadness    Euphoric  Exhausted Sleep dissorder Mood Swing  \
0     Patiant-01    Usually      Seldom  Sometimes       Sometimes        YES   
1     Patiant-02    Usually      Seldom    Usually       Sometimes         NO   
2     Patiant-03  Sometimes  Most-Often  Sometimes       Sometimes        YES   
3     Patiant-04    Usually      Seldom    Usually      Most-Often        YES   
4     Patiant-05    Usually     Usually  Sometimes       Sometimes         NO   

  Suicidal thoughts Anorxia Authority Respect Try-Explanation  \
0              YES       NO                NO             YES   
1               YES      NO                NO              NO   
2                NO      NO                NO             YES   
3               YES     YES                NO             YES   
4                NO      NO                NO              NO   

  Aggressive Response Ignore & Move-On Nervous Break-down Admit Mistakes  \
0                  NO        

In [6]:
# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Print dataset sizes
print(f"\nTraining set size: {X_train.shape}, {y_train.shape}")
print(f"Testing set size: {X_test.shape}, {y_test.shape}")

# Create an SVM model
svm_model = SVC(kernel='linear', random_state=42)

# Train the model
svm_model.fit(X_train, y_train)

# Make predictions
y_pred = svm_model.predict(X_test)

# Ensure predictions are generated
print(f"\nPredictions: {y_pred[:5]}")  # Print first 5 predictions for verification

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print("\nAccuracy:", accuracy)
print(f"\nThe model achieved an accuracy of {accuracy * 100:.2f}%")



Training set size: (4208, 5309), (4208,)
Testing set size: (1053, 5309), (1053,)

Predictions: ['Normal' 'Bipolar Type-2' 'Depression' 'Normal' 'Normal']

Accuracy: 0.2849002849002849

The model achieved an accuracy of 28.49%


In [7]:
print("\nUnique classes in the target:", y.unique())
print("\nFirst 5 predictions:", y_pred[:5])
print("\nFirst 5 actual values:", y_test[:5].values)



Unique classes in the target: ['Bipolar Type-2' 'Depression' 'Bipolar Type-1' 'Normal']

First 5 predictions: ['Normal' 'Bipolar Type-2' 'Depression' 'Normal' 'Normal']

First 5 actual values: ['Bipolar Type-1' 'Normal' 'Depression' 'Bipolar Type-2' 'Normal']


In [8]:
from sklearn.decomposition import PCA

# Apply PCA to reduce dimensionality
pca = PCA(n_components=50, random_state=42)  # Keep 50 principal components
X_reduced = pca.fit_transform(X)

print(f"\nReduced feature set shape: {X_reduced.shape}")



Reduced feature set shape: (5261, 50)


In [9]:
print("\nAccuracy:", accuracy)
print(f"\nThe model achieved an accuracy of {accuracy * 100:.2f}%")



Accuracy: 0.2849002849002849

The model achieved an accuracy of 28.49%


In [10]:
svm_model = SVC(kernel='linear', random_state=42, decision_function_shape='ovr')  # One-vs-Rest strategy


In [11]:
# Apply PCA to reduce dimensionality
pca = PCA(n_components=50, random_state=42)  # Reduce to 50 features
X_reduced = pca.fit_transform(X)
print(f"\nReduced feature set shape: {X_reduced.shape}")

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_reduced, y, test_size=0.2, random_state=42)

# Verify training and testing sizes
print(f"\nTraining set size: {X_train.shape}, {y_train.shape}")
print(f"Testing set size: {X_test.shape}, {y_test.shape}")

# Create an SVM model
svm_model = SVC(kernel='linear', random_state=42, decision_function_shape='ovr')

# Train the model
svm_model.fit(X_train, y_train)

# Make predictions
y_pred = svm_model.predict(X_test)

# Verify predictions
print("\nUnique classes in the target:", y.unique())
print("\nFirst 5 predictions:", y_pred[:5])
print("\nFirst 5 actual values:", y_test[:5].values)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print("\nAccuracy:", accuracy)
print(f"\nThe model achieved an accuracy of {accuracy * 100:.2f}%")



Reduced feature set shape: (5261, 50)

Training set size: (4208, 50), (4208,)
Testing set size: (1053, 50), (1053,)

Unique classes in the target: ['Bipolar Type-2' 'Depression' 'Bipolar Type-1' 'Normal']

First 5 predictions: ['Normal' 'Bipolar Type-2' 'Depression' 'Normal' 'Normal']

First 5 actual values: ['Bipolar Type-1' 'Normal' 'Depression' 'Bipolar Type-2' 'Normal']

Accuracy: 0.288698955365622

The model achieved an accuracy of 28.87%
