In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report
import nltk
from nltk.corpus import stopwords
import matplotlib.pyplot as plt
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer

In [2]:
# Load the training and test data
train_df = pd.read_csv('train.csv')
test_df = pd.read_csv('test.csv')

In [3]:
#cleaning train data by Lemmatizing and stopwords separation
cleanedData = []

lemma = WordNetLemmatizer()
swords = stopwords.words("english")
for text in train_df["Text"]:
    text = nltk.word_tokenize(text.lower())
    text = [lemma.lemmatize(word) for word in text]
    
    # Removing stopwords
    text = [word for word in text if word not in swords]
    
    # Joining
    text = " ".join(text)
    
    cleanedData.append(text)

In [4]:
#cleaning train data by Lemmatizing and stopwords separation
cleanedData1 = []

lemma1 = WordNetLemmatizer()
swords1 = stopwords.words("english")
for text in test_df["Text"]:
    text = nltk.word_tokenize(text.lower())
    text = [lemma1.lemmatize(word) for word in text]
    
    # Removing stopwords
    text = [word for word in text if word not in swords1]
    
    # Joining
    text = " ".join(text)
    
    cleanedData1.append(text)

In [5]:
#creating cleaned dataframes
train = pd.DataFrame({'Text': cleanedData})
test = pd.DataFrame({'Text': cleanedData1})

# Save the dataframes to CSV files
train.to_csv('train_clean.csv', index=False)
test.to_csv('test_clean.csv', index=False)

train_clean=pd.read_csv('train_clean.csv')
test_clean=pd.read_csv('test_clean.csv')

In [6]:
#fetching train values
X_train = train_clean['Text']
y_train = train_df['label']

In [7]:
#vectorization by TFI-DF vectorizer
tfidf_vectorizer = TfidfVectorizer(max_features=10000)
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)

In [8]:

# Train a Multinomial Naive Bayes classifier
classifier = MultinomialNB()
classifier.fit(X_train_tfidf, y_train)

# Transform the test text data into TF-IDF features
X_test = test_clean['Text']
X_test_tfidf = tfidf_vectorizer.transform(X_test)

# Predict labels for the test data
y_pred = classifier.predict(X_test_tfidf)

# Add the predicted labels to the test dataframe
test_df['predicted_label'] = y_pred

# Save the test dataframe with predictions to a CSV file
test_df.to_csv('test_with_predictions.csv', index=False)

# Display the classification report for evaluation
print(classification_report(y_train, classifier.predict(X_train_tfidf)))


                                        precision    recall  f1-score   support

Anger/ Intermittent Explosive Disorder       0.86      0.83      0.84       154
                      Anxiety Disorder       0.84      0.90      0.86       153
                            Depression       0.77      0.88      0.82       208
                 Narcissistic Disorder       0.99      0.99      0.99       158
                        Panic Disorder       0.99      0.67      0.80       112

                              accuracy                           0.87       785
                             macro avg       0.89      0.85      0.86       785
                          weighted avg       0.88      0.87      0.87       785



In [9]:

from sklearn.tree import DecisionTreeClassifier
# Train a Decision Tree classifier
classifier1 = DecisionTreeClassifier(max_depth=12, criterion='entropy', random_state=1)
classifier1.fit(X_train_tfidf, y_train)

# Transform the test text data into TF-IDF features
X_test = test_clean['Text']
X_test_tfidf = tfidf_vectorizer.transform(X_test)

# Predict labels for the test data
y_pred = classifier1.predict(X_test_tfidf)

# Add the predicted labels to the test dataframe
test_df['predicted_label'] = y_pred

# Save the test dataframe with predictions to a CSV file
test_df.to_csv('test_with_predictions_SVM.csv', index=False)

# Display the classification report for evaluation
print(classification_report(y_train, classifier1.predict(X_train_tfidf)))


                                        precision    recall  f1-score   support

Anger/ Intermittent Explosive Disorder       0.80      0.81      0.81       154
                      Anxiety Disorder       0.90      0.66      0.76       153
                            Depression       0.42      0.81      0.55       208
                 Narcissistic Disorder       0.97      0.35      0.52       158
                        Panic Disorder       0.96      0.49      0.65       112

                              accuracy                           0.64       785
                             macro avg       0.81      0.63      0.66       785
                          weighted avg       0.78      0.64      0.65       785



In [10]:
from sklearn import svm
# Train a Support vector classifier
classifier = svm.SVC(kernel='sigmoid')
classifier.fit(X_train_tfidf, y_train)

# Transform the test text data into TF-IDF features
X_test = test_clean['Text']
X_test_tfidf = tfidf_vectorizer.transform(X_test)

# Predict labels for the test data
y_pred = classifier.predict(X_test_tfidf)

# Add the predicted labels to the test dataframe
test_df['predicted_label'] = y_pred

# Save the test dataframe with predictions to a CSV file
test_df.to_csv('test_with_predictions_SVM_poly.csv', index=False)

# Display the classification report for evaluation
print(classification_report(y_train, classifier.predict(X_train_tfidf)))

                                        precision    recall  f1-score   support

Anger/ Intermittent Explosive Disorder       0.82      0.88      0.85       154
                      Anxiety Disorder       0.85      0.91      0.88       153
                            Depression       0.82      0.81      0.82       208
                 Narcissistic Disorder       0.98      0.99      0.99       158
                        Panic Disorder       0.95      0.77      0.85       112

                              accuracy                           0.87       785
                             macro avg       0.88      0.87      0.88       785
                          weighted avg       0.88      0.87      0.87       785



In [11]:
from sklearn.neural_network import MLPClassifier
model=MLPClassifier(hidden_layer_sizes=(6,5),random_state=42,verbose=True,learning_rate_init=0.001,max_iter=2000)
X_test = test_clean['Text']
X_test_tfidf = tfidf_vectorizer.transform(X_test)
model.fit(X_train_tfidf, y_train)
# Predict labels for the test data
y_pred = model.predict(X_test_tfidf)

# Add the predicted labels to the test dataframe
test_df['predicted_label'] = y_pred

# Save the test dataframe with predictions to a CSV file
test_df.to_csv('test_with_predictions_SVM_MLP.csv', index=False)

# Display the classification report for evaluation
print(classification_report(y_train, classifier.predict(X_train_tfidf)))

Iteration 1, loss = 1.72012548
Iteration 2, loss = 1.71239558
Iteration 3, loss = 1.70542864
Iteration 4, loss = 1.69820639
Iteration 5, loss = 1.69134112
Iteration 6, loss = 1.68407084
Iteration 7, loss = 1.67700176
Iteration 8, loss = 1.66994829
Iteration 9, loss = 1.66278433
Iteration 10, loss = 1.65540043
Iteration 11, loss = 1.64797000
Iteration 12, loss = 1.64021620
Iteration 13, loss = 1.63221594
Iteration 14, loss = 1.62365589
Iteration 15, loss = 1.61464208
Iteration 16, loss = 1.60520247
Iteration 17, loss = 1.59454958
Iteration 18, loss = 1.58380790
Iteration 19, loss = 1.57248108
Iteration 20, loss = 1.56048178
Iteration 21, loss = 1.54846872
Iteration 22, loss = 1.53629111
Iteration 23, loss = 1.52412492
Iteration 24, loss = 1.51135950
Iteration 25, loss = 1.49861728
Iteration 26, loss = 1.48625882
Iteration 27, loss = 1.47362219
Iteration 28, loss = 1.46103127
Iteration 29, loss = 1.44865785
Iteration 30, loss = 1.43624429
Iteration 31, loss = 1.42390528
Iteration 32, los

Iteration 258, loss = 0.39833358
Iteration 259, loss = 0.39613831
Iteration 260, loss = 0.39396964
Iteration 261, loss = 0.39174497
Iteration 262, loss = 0.38956392
Iteration 263, loss = 0.38741254
Iteration 264, loss = 0.38532999
Iteration 265, loss = 0.38308768
Iteration 266, loss = 0.38099425
Iteration 267, loss = 0.37895912
Iteration 268, loss = 0.37677472
Iteration 269, loss = 0.37468852
Iteration 270, loss = 0.37261581
Iteration 271, loss = 0.37057825
Iteration 272, loss = 0.36853726
Iteration 273, loss = 0.36650577
Iteration 274, loss = 0.36449974
Iteration 275, loss = 0.36249665
Iteration 276, loss = 0.36055975
Iteration 277, loss = 0.35857412
Iteration 278, loss = 0.35656090
Iteration 279, loss = 0.35462879
Iteration 280, loss = 0.35274317
Iteration 281, loss = 0.35082669
Iteration 282, loss = 0.34891084
Iteration 283, loss = 0.34705433
Iteration 284, loss = 0.34515629
Iteration 285, loss = 0.34335360
Iteration 286, loss = 0.34145071
Iteration 287, loss = 0.33960298
Iteration 

Iteration 507, loss = 0.14797662
Iteration 508, loss = 0.14758470
Iteration 509, loss = 0.14723219
Iteration 510, loss = 0.14684876
Iteration 511, loss = 0.14650456
Iteration 512, loss = 0.14617876
Iteration 513, loss = 0.14583217
Iteration 514, loss = 0.14545735
Iteration 515, loss = 0.14509802
Iteration 516, loss = 0.14476706
Iteration 517, loss = 0.14444385
Iteration 518, loss = 0.14410957
Iteration 519, loss = 0.14372196
Iteration 520, loss = 0.14339705
Iteration 521, loss = 0.14305593
Iteration 522, loss = 0.14272899
Iteration 523, loss = 0.14237368
Iteration 524, loss = 0.14207238
Iteration 525, loss = 0.14171632
Iteration 526, loss = 0.14139786
Iteration 527, loss = 0.14105582
Iteration 528, loss = 0.14075303
Iteration 529, loss = 0.14041998
Iteration 530, loss = 0.14010880
Iteration 531, loss = 0.13979766
Iteration 532, loss = 0.13946786
Iteration 533, loss = 0.13918021
Iteration 534, loss = 0.13884010
Iteration 535, loss = 0.13850610
Iteration 536, loss = 0.13821613
Iteration 

Iteration 764, loss = 0.09653814
Iteration 765, loss = 0.09642080
Iteration 766, loss = 0.09630638
Iteration 767, loss = 0.09624679
Iteration 768, loss = 0.09611531
Iteration 769, loss = 0.09601153
Iteration 770, loss = 0.09589764
Iteration 771, loss = 0.09577394
Iteration 772, loss = 0.09569060
Iteration 773, loss = 0.09560369
Iteration 774, loss = 0.09548503
Iteration 775, loss = 0.09541294
Iteration 776, loss = 0.09527806
Iteration 777, loss = 0.09522775
Iteration 778, loss = 0.09512270
Iteration 779, loss = 0.09502670
Iteration 780, loss = 0.09491429
Iteration 781, loss = 0.09479740
Iteration 782, loss = 0.09469524
Iteration 783, loss = 0.09461173
Iteration 784, loss = 0.09451829
Iteration 785, loss = 0.09444239
Iteration 786, loss = 0.09432770
Iteration 787, loss = 0.09421011
Iteration 788, loss = 0.09409287
Iteration 789, loss = 0.09408105
Iteration 790, loss = 0.09398130
Iteration 791, loss = 0.09380031
Iteration 792, loss = 0.09376574
Iteration 793, loss = 0.09364716
Iteration 

In [12]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import confusion_matrix
model1=DecisionTreeClassifier(max_depth=6,criterion='entropy',random_state=42)
X_test = test_clean['Text']
X_test_tfidf = tfidf_vectorizer.transform(X_test)
model.fit(X_train_tfidf, y_train)
# Predict labels for the test data
y_pred = model.predict(X_test_tfidf)

# Add the predicted labels to the test dataframe
test_df['predicted_label'] = y_pred

# Save the test dataframe with predictions to a CSV file
test_df.to_csv('test_with_predictions_DecisionTree.csv', index=False)

# Display the classification report for evaluation
print(classification_report(y_train, classifier.predict(X_train_tfidf)))
print(confusion_matrix(y_train, classifier.predict(X_train_tfidf)))

Iteration 1, loss = 1.72012548
Iteration 2, loss = 1.71239558
Iteration 3, loss = 1.70542864
Iteration 4, loss = 1.69820639
Iteration 5, loss = 1.69134112
Iteration 6, loss = 1.68407084
Iteration 7, loss = 1.67700176
Iteration 8, loss = 1.66994829
Iteration 9, loss = 1.66278433
Iteration 10, loss = 1.65540043
Iteration 11, loss = 1.64797000
Iteration 12, loss = 1.64021620
Iteration 13, loss = 1.63221594
Iteration 14, loss = 1.62365589
Iteration 15, loss = 1.61464208
Iteration 16, loss = 1.60520247
Iteration 17, loss = 1.59454958
Iteration 18, loss = 1.58380790
Iteration 19, loss = 1.57248108
Iteration 20, loss = 1.56048178
Iteration 21, loss = 1.54846872
Iteration 22, loss = 1.53629111
Iteration 23, loss = 1.52412492
Iteration 24, loss = 1.51135950
Iteration 25, loss = 1.49861728
Iteration 26, loss = 1.48625882
Iteration 27, loss = 1.47362219
Iteration 28, loss = 1.46103127
Iteration 29, loss = 1.44865785
Iteration 30, loss = 1.43624429
Iteration 31, loss = 1.42390528
Iteration 32, los

Iteration 319, loss = 0.28806935
Iteration 320, loss = 0.28673068
Iteration 321, loss = 0.28529252
Iteration 322, loss = 0.28390086
Iteration 323, loss = 0.28251359
Iteration 324, loss = 0.28119789
Iteration 325, loss = 0.27981137
Iteration 326, loss = 0.27849348
Iteration 327, loss = 0.27716588
Iteration 328, loss = 0.27594893
Iteration 329, loss = 0.27451756
Iteration 330, loss = 0.27327453
Iteration 331, loss = 0.27199514
Iteration 332, loss = 0.27073217
Iteration 333, loss = 0.26943071
Iteration 334, loss = 0.26818771
Iteration 335, loss = 0.26694736
Iteration 336, loss = 0.26571575
Iteration 337, loss = 0.26448789
Iteration 338, loss = 0.26327130
Iteration 339, loss = 0.26207723
Iteration 340, loss = 0.26088030
Iteration 341, loss = 0.25967533
Iteration 342, loss = 0.25856907
Iteration 343, loss = 0.25735520
Iteration 344, loss = 0.25621660
Iteration 345, loss = 0.25509273
Iteration 346, loss = 0.25393500
Iteration 347, loss = 0.25279953
Iteration 348, loss = 0.25171354
Iteration 

Iteration 590, loss = 0.12380671
Iteration 591, loss = 0.12359241
Iteration 592, loss = 0.12336852
Iteration 593, loss = 0.12314058
Iteration 594, loss = 0.12288600
Iteration 595, loss = 0.12266574
Iteration 596, loss = 0.12244835
Iteration 597, loss = 0.12220836
Iteration 598, loss = 0.12202709
Iteration 599, loss = 0.12177634
Iteration 600, loss = 0.12160591
Iteration 601, loss = 0.12135986
Iteration 602, loss = 0.12116189
Iteration 603, loss = 0.12095408
Iteration 604, loss = 0.12070049
Iteration 605, loss = 0.12049527
Iteration 606, loss = 0.12029695
Iteration 607, loss = 0.12010073
Iteration 608, loss = 0.11984298
Iteration 609, loss = 0.11965592
Iteration 610, loss = 0.11944202
Iteration 611, loss = 0.11923125
Iteration 612, loss = 0.11903079
Iteration 613, loss = 0.11882680
Iteration 614, loss = 0.11861495
Iteration 615, loss = 0.11841707
Iteration 616, loss = 0.11822045
Iteration 617, loss = 0.11804470
Iteration 618, loss = 0.11782378
Iteration 619, loss = 0.11761522
Iteration 

Iteration 846, loss = 0.08915835
Iteration 847, loss = 0.08908414
Iteration 848, loss = 0.08899534
Iteration 849, loss = 0.08892856
Iteration 850, loss = 0.08888418
Iteration 851, loss = 0.08885887
Iteration 852, loss = 0.08870976
Iteration 853, loss = 0.08860218
Iteration 854, loss = 0.08852805
Iteration 855, loss = 0.08848001
Iteration 856, loss = 0.08839284
Iteration 857, loss = 0.08833039
Iteration 858, loss = 0.08824551
Iteration 859, loss = 0.08821989
Iteration 860, loss = 0.08816277
Iteration 861, loss = 0.08804056
Iteration 862, loss = 0.08796057
Iteration 863, loss = 0.08789044
Iteration 864, loss = 0.08783165
Iteration 865, loss = 0.08780684
Iteration 866, loss = 0.08764931
Iteration 867, loss = 0.08760863
Iteration 868, loss = 0.08755897
Iteration 869, loss = 0.08746484
Iteration 870, loss = 0.08739009
Iteration 871, loss = 0.08733625
Iteration 872, loss = 0.08724707
Iteration 873, loss = 0.08720395
Iteration 874, loss = 0.08713522
Iteration 875, loss = 0.08707617
Iteration 

In [13]:
!pip install xgboost


Defaulting to user installation because normal site-packages is not writeable


In [14]:
import xgboost as xgb
from sklearn.metrics import classification_report, confusion_matrix
import pandas as pd

# Assuming you have already imported other necessary libraries and have your data ready.

# Create an XGBoost classifier
model2 = xgb.XGBClassifier(random_state=42)

# Fit the XGBoost classifier to your training data
model2.fit(X_train_tfidf, y_train)

# Predict labels for the test data
y_pred_xgb = model2.predict(X_test_tfidf)

# Add the predicted labels to the test dataframe
test_df['predicted_label'] = y_pred_xgb

# Save the test dataframe with predictions to a CSV file
test_df.to_csv('test_with_predictions_XGBoost.csv', index=False)

# Display the classification report for evaluation
print(classification_report(y_train, model2.predict(X_train_tfidf)))
print(confusion_matrix(y_train, model2.predict(X_train_tfidf)))


ValueError: Invalid classes inferred from unique values of `y`.  Expected: [0 1 2 3 4], got ['Anger/ Intermittent Explosive Disorder' 'Anxiety Disorder' 'Depression'
 'Narcissistic Disorder' 'Panic Disorder']