In [4]:
# Open Colab's file-upload prompt for the training CSV and keep the call's
# return value in `uploaded`.
# NOTE(review): the recorded cell output shows the file was stored as
# 'train (1).csv' (a 'train.csv' apparently already existed in the runtime),
# while later code reads 'train.csv' -- confirm the intended copy is used.
from google.colab import files
 
 
uploaded = files.upload()

Saving train.csv to train (1).csv


In [6]:
# Second upload prompt, this time for the test CSV. The import repeats the one
# in the previous cell, and `uploaded` is rebound here, so the value returned
# by the earlier upload is no longer reachable through that name.
# NOTE(review): the recorded cell output shows the file was stored as
# 'test (1).csv', while later code reads 'test.csv' -- confirm the intended
# copy is used.
from google.colab import files
 
 
uploaded = files.upload()

Saving test.csv to test (1).csv


In [56]:
import pandas as pd
import io
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize


class TextClassifier:
    """TF-IDF + logistic-regression text classifier driven by two CSV files.

    The methods index the columns ``TEXT`` (all frames), ``LABEL`` (training
    frame) and ``ID`` (test frame, when writing predictions).

    Typical flow: ``load_data`` -> ``preprocess_data`` on each frame ->
    ``train`` -> ``predict`` -> ``save_predictions``.
    """

    def __init__(self, train_path, test_path):
        """Store the CSV paths and build the preprocessing and model objects.

        Args:
            train_path: Path handed to ``pd.read_csv`` for the training data.
            test_path: Path handed to ``pd.read_csv`` for the test data.

        Note:
            ``stopwords.words('english')`` is evaluated here, so the NLTK
            ``stopwords`` corpus must already be available when the object
            is constructed.
        """
        self.train_path = train_path
        self.test_path = test_path
        # A set gives O(1) membership checks in preprocess_text.
        self.stop_words = set(stopwords.words('english'))
        self.stemmer = PorterStemmer()
        # Word-level unigrams and bigrams.
        self.tfidf = TfidfVectorizer(lowercase=True, analyzer='word', ngram_range=(1, 2))
        # n_jobs is deliberately not passed: scikit-learn ignores it for the
        # 'liblinear' solver and only emits a warning, so dropping it leaves
        # the fitted model unchanged.
        self.clf = LogisticRegression(random_state=0, C=100, solver='liblinear', max_iter=500, tol=0.0001)

    def load_data(self):
        """Read both CSV files.

        Returns:
            tuple: ``(train_df, test_df)`` as returned by ``pd.read_csv``.
        """
        train_df = pd.read_csv(self.train_path)
        test_df = pd.read_csv(self.test_path)
        return train_df, test_df

    def preprocess_text(self, text):
        """Tokenize, lower-case, drop English stop words, stem, and re-join.

        Args:
            text: A single document as a string.

        Returns:
            str: The surviving stemmed tokens joined by single spaces.
        """
        # Lower-case once per token; the same form is used for the stop-word
        # test and as the stemmer input.
        lowered_tokens = (token.lower() for token in word_tokenize(text))
        return ' '.join(
            self.stemmer.stem(token)
            for token in lowered_tokens
            if token not in self.stop_words
        )

    def preprocess_data(self, df):
        """Return a copy of ``df`` whose ``TEXT`` column has been preprocessed.

        Every value is cast to ``str`` and passed through ``preprocess_text``.
        The input frame is left untouched, so calling this again on the same
        raw frame yields the same result instead of stemming already-stemmed
        text.

        Args:
            df: DataFrame with a ``TEXT`` column.

        Returns:
            pandas.DataFrame: New frame with the same columns as ``df``.
        """
        cleaned_text = df['TEXT'].astype(str).apply(self.preprocess_text)
        return df.assign(TEXT=cleaned_text)

    def train(self, train_df):
        """Fit the vectorizer and classifier, then print hold-out metrics.

        30% of ``train_df`` is held out (``random_state=0``) for validation;
        the TF-IDF vocabulary and the classifier are fitted on the remaining
        70% only. Accuracy, the confusion matrix and the classification report
        for the held-out part are printed.

        Args:
            train_df: DataFrame with ``TEXT`` and ``LABEL`` columns; ``TEXT``
                is used as-is, so run ``preprocess_data`` first.
        """
        X_train, X_val, y_train, y_val = train_test_split(
            train_df['TEXT'], train_df['LABEL'], test_size=0.3, random_state=0
        )
        X_train_tfidf = self.tfidf.fit_transform(X_train)
        X_val_tfidf = self.tfidf.transform(X_val)
        self.clf.fit(X_train_tfidf, y_train)

        # Predict once and reuse the result for all three reports.
        val_predictions = self.clf.predict(X_val_tfidf)
        accuracy = accuracy_score(y_val, val_predictions)
        print('Accuracy: {:.2f}%'.format(accuracy * 100))
        cm = confusion_matrix(y_val, val_predictions)
        print('Confusion Matrix: \n', cm)
        print(classification_report(y_val, val_predictions))

    def predict(self, test_df):
        """Predict labels for ``test_df`` with the fitted vectorizer and model.

        No stop-word removal or stemming happens here; pass a frame that has
        already gone through ``preprocess_data``. ``test_df`` is not modified.

        Args:
            test_df: DataFrame with a ``TEXT`` column.

        Returns:
            The output of ``self.clf.predict`` for the vectorized texts.
        """
        # Cast on a temporary Series rather than writing back into the
        # caller's frame.
        texts = test_df['TEXT'].astype(str)
        X_test_tfidf = self.tfidf.transform(texts)
        return self.clf.predict(X_test_tfidf)

    def save_predictions(self, test_df, predictions, output_file):
        """Write an ``ID,LABEL`` CSV (no index column).

        Args:
            test_df: DataFrame providing the ``ID`` column.
            predictions: One label per row of ``test_df``.
            output_file: Path or writable buffer accepted by
                ``DataFrame.to_csv``.
        """
        submission = pd.DataFrame({'ID': test_df['ID'], 'LABEL': predictions})
        submission.to_csv(output_file, index=False)


if __name__ == '__main__':
    # The NLTK corpora used by TextClassifier (stop-word list, tokenizer
    # models) are not shipped with the nltk package. Fetch them up front so
    # the cell also works on a fresh runtime; already-present data is not
    # downloaded again. 'punkt_tab' is what newer NLTK releases look up for
    # word_tokenize, 'punkt' covers older ones.
    for resource in ('stopwords', 'punkt', 'punkt_tab'):
        nltk.download(resource, quiet=True)

    # NOTE(review): the upload cells reported saving 'train (1).csv' and
    # 'test (1).csv'; the paths below read the un-suffixed files. Confirm
    # these are the intended copies.
    train_path = 'train.csv'
    test_path = 'test.csv'
    output_file = 'predictions.csv'

    classifier = TextClassifier(train_path, test_path)
    train_df, test_df = classifier.load_data()

    # Apply the same text cleaning to both frames; predict() does not clean.
    train_df = classifier.preprocess_data(train_df)
    test_df = classifier.preprocess_data(test_df)

    # Fits on a 70% split and prints metrics for the 30% hold-out.
    classifier.train(train_df)

    predictions = classifier.predict(test_df)
    classifier.save_predictions(test_df, predictions, output_file)




Accuracy: 92.51%
Confusion Matrix: 
 [[9355  115   60]
 [ 202 5158  475]
 [  92  636 5003]]
              precision    recall  f1-score   support

           0       0.97      0.98      0.98      9530
           1       0.87      0.88      0.88      5835
           2       0.90      0.87      0.89      5731

    accuracy                           0.93     21096
   macro avg       0.92      0.91      0.91     21096
weighted avg       0.92      0.93      0.92     21096

