In [1]:
import warnings
import pandas as pd
import numpy as np
from scipy import stats
import sklearn as sk
import itertools
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
import seaborn as sns
from sklearn import tree
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier

from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.stem import WordNetLemmatizer
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('stopwords')
import datetime
import re
import praw
import jsonlines


[nltk_data] Downloading package punkt to /home/ruru22/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /home/ruru22/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /home/ruru22/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
date_limits = ['2019-03-01 00:00:00', 
               '2019-06-01 00:00:00', 
               '2019-09-01 00:00:00', 
               '2020-01-01 00:00:00', 
               '2020-03-01 00:00:00', 
               '2020-06-01 00:00:00', 
               '2020-09-01 00:00:00', 
               '2021-01-01 00:00:00', 
               '2021-03-01 00:00:00', 
               '2021-06-01 00:00:00'
            ]

COUNT = len(date_limits)

In [5]:
def dfFromCSV(filename):
    posts = pd.read_csv(filename)
    return posts

def dfToCSV(posts, filename):
    posts.to_csv(filename, index=False)

def postsFromNDJSON(filename):
    posts = []
    with jsonlines.open(filename) as reader:
        for line in reader:
            line = line["Item"]
            posts.append([line["created"]["S"], line["title"]["S"], line["id"]["S"]])
    posts = pd.DataFrame(posts,columns=['created', 'title', 'id'])

    return posts

def sortByTime(data):
    data = data.sort_values(by='created')
    return data

posts = postsFromNDJSON("worldnews.jsonl")
posts = sortByTime(posts)
posts.drop(posts.loc[posts['title'] == 'None'].index, inplace=True)
dfToCSV(posts=posts, filename="worldnews.csv")
print(posts)


                    created  \
360671  2018-12-31 02:25:14   
270512  2018-12-31 02:27:21   
112640  2018-12-31 02:27:56   
294115  2018-12-31 02:29:05   
214742  2018-12-31 02:29:18   
...                     ...   
498320  2021-04-09 21:26:18   
427024  2021-04-09 21:26:19   
816977  2021-04-09 21:26:20   
144817  2021-04-09 21:26:21   
516521  2021-04-09 21:26:28   

                                                    title      id  
360671  پسکوف: هیچ اقدامی علیه مظنونان معرفی گردیده در...  ab2og2  
270512  One of the companies contracted by the governm...  ab2p2z  
112640  Donald Trump: Outgoing chief of staff John Kel...  ab2p8u  
294115  Todd Bowles Fired as Jets Head Coach After 4 S...  ab2pl2  
214742  Macron's former aide admits using diplomatic p...  ab2pn3  
...                                                   ...     ...  
498320  Prince Charles Visits Queen Elizabeth II After...  mnr2wp  
427024  Job postings hint at winners of NYC and London...  mnr2x1  
816977  Tell Us

In [8]:

lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

def preprocess(text):
    lemmatizer = WordNetLemmatizer()

    # Tokenize words
    tokens = word_tokenize(text)

    # remove links
    words = []
    for token in tokens: 
        if 'http' not in token and 'www' not in token:
            words.append(token)

    # Remove stopwords and lemmatize words
    words = [lemmatizer.lemmatize(word.lower()) for word in words if word.isalnum() and word.lower() not in stop_words]

    words_split = []
    for word in words:
        words_split.extend(re.split('(\d+)',word))

    return ' '.join(words_split)

def isEnglish(text):
    try:
        text.encode(encoding='utf-8').decode('ascii')
    except UnicodeDecodeError:
        return False
    else:
        return True


def separateNumbersAlphabets(str):
    numbers = re.findall(r'[0-9]+', str)
    alphabets = re.findall(r'[a-zA-Z]+', str)
    print(*numbers)
    print(*alphabets)

posts = dfFromCSV('worldnews.csv')

start = datetime.datetime.now()

lemmed = []

for idx in posts.index:
    if idx % 50000 == 0:
        print(idx / 1000)
    text = posts['title'][idx]

    text = preprocess(text)

    if isEnglish(text):
        lemmed.append(text)
    else:
        lemmed.append('DELETEME')

stop = datetime.datetime.now()
print("Took", stop - start, "seconds")

posts["title"] = lemmed

posts.drop(posts.loc[posts['title'] == 'DELETEME'].index, inplace=True)
posts.drop(posts.loc[posts['title'] == ''].index, inplace=True)

print(posts)
dfToCSV(posts, 'lemmatized_ascii.csv')

0.0
50.0
100.0
150.0
200.0
250.0
300.0
350.0
400.0
450.0
500.0
550.0
600.0
650.0
700.0
750.0
800.0
Took 0:01:19.261132 seconds
                    created  \
1       2018-12-31 02:27:21   
2       2018-12-31 02:27:56   
3       2018-12-31 02:29:05   
4       2018-12-31 02:29:18   
5       2018-12-31 02:30:30   
...                     ...   
818985  2021-04-09 21:26:18   
818986  2021-04-09 21:26:19   
818987  2021-04-09 21:26:20   
818988  2021-04-09 21:26:21   
818989  2021-04-09 21:26:28   

                                                    title      id  
1       one company contracted government charter ferr...  ab2p2z  
2       donald trump outgoing chief staff john kelly w...  ab2p8u  
3        todd bowles fired jet head coach  4  season team  ab2pl2  
4       macron former aide admits using diplomatic pas...  ab2pn3  
5       pep guardiola liverpool might best team world ...  ab2pzl  
...                                                   ...     ...  
818985  prince charles v