In [11]:
import os
import sys

# Make the project root both the working directory and an import location.
# The project root is taken to be the directory that holds the `src` package
# (assumes the notebook's own folder has no `src` directory - confirm).
# Both steps are guarded so that re-running this cell neither climbs another
# directory level nor adds a duplicate entry to sys.path.
current_dir = os.getcwd()
if os.path.isdir(os.path.join(current_dir, 'src')):
    # Already at the project root (e.g. the cell has been run before): stay put.
    parent_path = current_dir
else:
    # Go one step up from the current working directory
    parent_path = os.path.abspath(os.path.join(current_dir, '..'))
    os.chdir(parent_path)  # Change working directory

if parent_path not in sys.path:
    sys.path.append(parent_path)  # Add to Python path only once

print("Now in:", os.getcwd())

Now in: C:\Users\rohit\OneDrive\Desktop\Projects_main\News


In [12]:
import yaml
from src.data import load_data, process_training_data
from src.utils import clean_text, text_length, remove_outranged_length_articles, remove_duplicates, save_model, save_data
from src.models import LogisticRegression
from src import model_prediction
import pandas as pd
import os

In [13]:
# Load the pipeline settings (dataset locations, article-length limits, ...)
# from the YAML config, relative to the current working directory.
# The encoding is pinned to UTF-8: without it open() uses the platform's locale
# encoding, which can mis-decode non-ASCII characters in the file.
# yaml.safe_load only builds plain Python objects from the document.
with open('Configs/configs.yaml', 'r', encoding='utf-8') as config_file:
    data_configs = yaml.safe_load(config_file)


In [14]:
# Fetch both source datasets; each config entry is handed to fetch_data as-is.
# The 'true_data' entry is loaded first, then the 'fake_data' entry.
true_data, fake_data = (
    load_data.fetch_data(data_configs[dataset_key])
    for dataset_key in ('true_data', 'fake_data')
)

Loading data from Data/True.csv
Loading data from Data/Fake.csv


In [15]:
# Tag every row with its origin in a new 'class' column:
# 1 for the true-news frame, 0 for the fake-news frame.
true_data['class'], fake_data['class'] = 1, 0

In [16]:
# Stack the fake and true articles into one frame (fake rows first).
# ignore_index=True gives the result a fresh 0..n-1 index. Without it the row
# labels of both inputs are kept, so the same label can appear twice and any
# label-based selection or drop would touch rows from both datasets.
df = pd.concat([fake_data, true_data], axis=0, ignore_index=True)

In [17]:
# Preview the first five rows of the combined frame.
df.head(n=5)

Unnamed: 0,title,text,subject,date,class
0,Donald Trump Sends Out Embarrassing New Year’...,Donald Trump just couldn t wish all Americans ...,News,"December 31, 2017",0
1,Drunk Bragging Trump Staffer Started Russian ...,House Intelligence Committee Chairman Devin Nu...,News,"December 31, 2017",0
2,Sheriff David Clarke Becomes An Internet Joke...,"On Friday, it was revealed that former Milwauk...",News,"December 30, 2017",0
3,Trump Is So Obsessed He Even Has Obama’s Name...,"On Christmas day, Donald Trump announced that ...",News,"December 29, 2017",0
4,Pope Francis Just Called Out Donald Trump Dur...,Pope Francis used his annual Christmas Day mes...,News,"December 25, 2017",0


In [18]:
# Discard the metadata columns that are not used further on.
df = df.drop(columns=["title", "subject", "date"])

# Run every article body through the project's clean_text helper.
print('Cleaning data')
df["text"] = df["text"].apply(clean_text.clean_text)

Cleaning data


In [19]:
# Store the per-article length returned by text_length.get_text_length in a
# new 'word_count' column, then preview the first five rows.
print('Getting word count of each article')
df['word_count'] = df['text'].apply(text_length.get_text_length)
df.head(n=5)

Getting word count of each article


Unnamed: 0,text,class,word_count
0,donald trump just couldn t wish all americans ...,0,454
1,house intelligence committee chairman devin nu...,0,304
2,on friday it was revealed that former milwauke...,0,542
3,on christmas day donald trump announced that h...,0,411
4,pope francis used his annual christmas day mes...,0,420


In [20]:
# Filter out articles whose length falls outside the bounds set in the config
# ('min_article_length' and 'max_article_length').
print('removing articles with outofrange length')
df = remove_outranged_length_articles.articles_by_length(
    df,
    data_configs['min_article_length'],
    data_configs['max_article_length'],
)

removing articles with outofrange length
removed 1172 articles which are less than 20 words
removed 45 articles which are greater than 4000 words


In [21]:
# De-duplicate the frame; 'text' is passed as the column the helper works on
# (presumably rows are compared on that column - verify against the helper).
print('removing duplicate articles')
df = remove_duplicates.remove_duplicate_data(
    df,
    'text',
)

removing duplicate articles
Removed 5451 duplicate rows from data


In [22]:
# Hand the cleaned frame to process_data, which returns the train/test
# features and labels in this order.
(x_train, x_test, y_train, y_test) = process_training_data.process_data(df)


preparing data for traning the model


In [23]:
# Build and fit the project's LogisticRegression model on the training split.
model = LogisticRegression.lr(
    x_train,
    y_train,
)

Initializing model: LogisticRegression
Training model


In [24]:
# Run the trained model on the held-out test features via the project's
# predict_model helper.
prediction = model_prediction.predict_model(
    model,
    x_test,
)

In [25]:
# Report the model's score on the test split; as the last expression in the
# cell, the value is displayed as the cell result.
model.score(
    x_test,
    y_test,
)

0.985090243264452