# Imports

In [132]:
import pandas as pd
import numpy as np
import re
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Data 

In [133]:
df= pd.read_csv('train.csv')

In [134]:
df.head()

Unnamed: 0,id,title,author,text,label
0,0,House Dem Aide: We Didn’t Even See Comey’s Let...,Darrell Lucus,House Dem Aide: We Didn’t Even See Comey’s Let...,1
1,1,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",Daniel J. Flynn,Ever get the feeling your life circles the rou...,0
2,2,Why the Truth Might Get You Fired,Consortiumnews.com,"Why the Truth Might Get You Fired October 29, ...",1
3,3,15 Civilians Killed In Single US Airstrike Hav...,Jessica Purkiss,Videos 15 Civilians Killed In Single US Airstr...,1
4,4,Iranian woman jailed for fictional unpublished...,Howard Portnoy,Print \nAn Iranian woman has been sentenced to...,1


In [135]:
df.shape

(20800, 5)

In [136]:
df.isnull().sum()

id           0
title      558
author    1957
text        39
label        0
dtype: int64

In [137]:
df.fillna('', inplace=True)

In [138]:
df.shape

(20800, 5)

In [139]:
df['content'] = df['title'] + ' ' + df['author'] + ' ' + df['text']

In [140]:
df.head()

Unnamed: 0,id,title,author,text,label,content
0,0,House Dem Aide: We Didn’t Even See Comey’s Let...,Darrell Lucus,House Dem Aide: We Didn’t Even See Comey’s Let...,1,House Dem Aide: We Didn’t Even See Comey’s Let...
1,1,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",Daniel J. Flynn,Ever get the feeling your life circles the rou...,0,"FLYNN: Hillary Clinton, Big Woman on Campus - ..."
2,2,Why the Truth Might Get You Fired,Consortiumnews.com,"Why the Truth Might Get You Fired October 29, ...",1,Why the Truth Might Get You Fired Consortiumne...
3,3,15 Civilians Killed In Single US Airstrike Hav...,Jessica Purkiss,Videos 15 Civilians Killed In Single US Airstr...,1,15 Civilians Killed In Single US Airstrike Hav...
4,4,Iranian woman jailed for fictional unpublished...,Howard Portnoy,Print \nAn Iranian woman has been sentenced to...,1,Iranian woman jailed for fictional unpublished...


In [141]:
X = df['content']
y = df['label']

In [142]:
vectorizer = TfidfVectorizer(max_features=6000, stop_words='english')
X_tfidf = vectorizer.fit_transform(X)

In [143]:
# 7. Split Data into Training and Testing Sets
X_train, X_test, y_train, y_test = train_test_split(X_tfidf, y, test_size=0.2, random_state=4)

In [144]:
# 8. Train Logistic Regression Model
model = LogisticRegression(penalty='l2', C=1)
model.fit(X_train, y_train)

# 9. Make Predictions
y_train_pred = model.predict(X_train)
y_test_pred = model.predict(X_test)

# 10. Evaluate Model
print("Train accuracy:", accuracy_score(y_train, y_train_pred))
print("Test accuracy:", accuracy_score(y_test, y_test_pred))

print("Classification Report:\n", classification_report(y_test, y_test_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_test_pred))

Train accuracy: 0.9772836538461539
Test accuracy: 0.9560096153846154
Classification Report:
               precision    recall  f1-score   support

           0       0.96      0.95      0.96      2069
           1       0.95      0.96      0.96      2091

    accuracy                           0.96      4160
   macro avg       0.96      0.96      0.96      4160
weighted avg       0.96      0.96      0.96      4160

Confusion Matrix:
 [[1963  106]
 [  77 2014]]


# Predicting Function

In [145]:
def preprocess(df, vectorizer):
    df.fillna('', inplace=True)
    df['content'] = df['title'] + ' ' + df['author'] + ' ' + df['text']
    X = df['content']
    X = vectorizer.transform(X)  # Use transform, not fit_transform
    return X

# Step 3: Test the model on new data
test_data = pd.read_csv('test.csv')
processed_data = preprocess(test_data, vectorizer)

def news_test(data, df):
    predictions = model.predict(data)
    output_df = pd.DataFrame({
        'id': df['id'],
        'label': predictions
    })
    return output_df

# Generate predictions and save to CSV
result = news_test(processed_data, test_data)
# result.to_csv("predictions.csv", index=False) ## If you want to save as csv

In [146]:
result

Unnamed: 0,id,label
0,20800,0
1,20801,1
2,20802,1
3,20803,0
4,20804,1
...,...,...
5195,25995,1
5196,25996,0
5197,25997,0
5198,25998,1
