In [5]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score


import chardet

file_path = '/content/corpusdata.csv'

# Detect the file encoding
with open(file_path, 'rb') as f:
    result = chardet.detect(f.read())
    encoding = result['encoding']

print(f"Detected encoding: {encoding}")

# Read the file with the detected encoding
try:
    data = pd.read_csv(file_path, encoding=encoding)
    print(data.head())
except Exception as e:
    print(f"Error reading with detected encoding '{encoding}': {e}")


# Display the first few rows of the dataset
print(data.head())

# Check for null values
print("Null values in each column:\n", data.isnull().sum())

# Assuming the CSV file has two columns: 'text' and 'label'
# If your file has different column names, adjust accordingly
text_data = data['text']
labels = data['label']

# Preprocessing the text data
def preprocess_text(text):
    # Convert text to lowercase
    text = text.lower()
    # Remove punctuation and non-alphanumeric characters
    text = ''.join([char for char in text if char.isalnum() or char.isspace()])
    return text

text_data = text_data.apply(preprocess_text)

# Vectorize the text data using TF-IDF
vectorizer = TfidfVectorizer(max_features=5000)
X = vectorizer.fit_transform(text_data).toarray()

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, labels, test_size=0.2, random_state=42)

# Train an SVM classifier
svm = SVC(kernel='linear', random_state=42)
svm.fit(X_train, y_train)

# Predict on the test set
y_pred = svm.predict(X_test)

# Evaluate the classifier
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print(f"Accuracy: {accuracy}")
print("Classification Report:\n", report)


Detected encoding: ISO-8859-1
                                                text        label
0   Stuning even for the non-gamer: This sound tr...  __label__2 
1   The best soundtrack ever to anything.: I'm re...  __label__2 
2   Amazing!: This soundtrack is my favorite musi...  __label__2 
3   Excellent Soundtrack: I truly like this sound...  __label__2 
4   Remember, Pull Your Jaw Off The Floor After H...  __label__2 
                                                text        label
0   Stuning even for the non-gamer: This sound tr...  __label__2 
1   The best soundtrack ever to anything.: I'm re...  __label__2 
2   Amazing!: This soundtrack is my favorite musi...  __label__2 
3   Excellent Soundtrack: I truly like this sound...  __label__2 
4   Remember, Pull Your Jaw Off The Floor After H...  __label__2 
Null values in each column:
 text     0
label    0
dtype: int64
Accuracy: 0.8625
Classification Report:
               precision    recall  f1-score   support

 __label__1       