In [8]:
import pandas as pd
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn.feature_extraction.text import CountVectorizer

In [9]:
trainData = pd.read_csv('train.csv')
testData = pd.read_csv('test_without_labels.csv')
x_train = trainData['Title']+" "+ trainData['Content']
y_train = trainData['Label']
x_test = testData['Title']+" "+ testData['Content']

In [10]:
#Bag of word convertion
vectorizer = CountVectorizer(max_features=1000)
X_train_bow = vectorizer.fit_transform(x_train)
X_test_bow = vectorizer.transform(x_test)
X_train_split, X_val_split, y_train_split, y_val_split = train_test_split(
    X_train_bow, y_train, test_size=0.2, random_state=42
)

In [11]:
#SVM
svm_model = LinearSVC(random_state=42,max_iter=2000)
svm_model.fit(X_train_split, y_train_split)
y_test_pred = svm_model.predict(X_test_bow)
output = pd.DataFrame({
    'Id': range(1, len(y_test_pred) + 1),
    'Predicted': y_test_pred
})
print(output)
accuracy = cross_val_score(svm_model, X_val_split, y_val_split, cv=5, scoring='accuracy')
mean_accuracy_percent = accuracy.mean()
mean_accuracy_percent = mean_accuracy_percent.item()*100
print(f"5-Fold Cross Validation Accuracy SVM: {mean_accuracy_percent:.2f}%")

          Id      Predicted
0          1  Entertainment
1          2       Business
2          3  Entertainment
3          4  Entertainment
4          5  Entertainment
...      ...            ...
47907  47908     Technology
47908  47909  Entertainment
47909  47910     Technology
47910  47911       Business
47911  47912     Technology

[47912 rows x 2 columns]
5-Fold Cross Validation Accuracy SVM: 88.73%


In [12]:
#RandomForest
RFModel = RandomForestClassifier(n_estimators=50 ,max_features='sqrt',n_jobs=-1, random_state=42)
RFModel.fit(X_train_split, y_train_split)
y_test_pred_RFModel = RFModel.predict(X_test_bow)
output = pd.DataFrame({
    'Id': range(1, len(y_test_pred_RFModel) + 1),
    'Predicted': y_test_pred_RFModel
})
print(output)
accuracy = cross_val_score(RFModel, X_val_split, y_val_split, cv=5, scoring='accuracy')
mean_accuracy_percent = accuracy.mean()
mean_accuracy_percent = mean_accuracy_percent.item()*100
print(f"5-Fold Cross Validation Accuracy Random Forest: {mean_accuracy_percent:.2f}%")

          Id      Predicted
0          1  Entertainment
1          2       Business
2          3  Entertainment
3          4  Entertainment
4          5  Entertainment
...      ...            ...
47907  47908     Technology
47908  47909  Entertainment
47909  47910     Technology
47910  47911     Technology
47911  47912     Technology

[47912 rows x 2 columns]
5-Fold Cross Validation Accuracy Random Forest: 88.37%
