In [1]:
import pickle 

channels = ['ndtv', 'indiatoday', 'republic']
data = {}
for c in (channels):
    with open("transcripts/" + c + ".txt", "rb") as file:
        data[c] = [file.read().decode("utf-8") ]

In [2]:
data.keys()

dict_keys(['ndtv', 'indiatoday', 'republic'])

In [3]:
import pandas as pd
pd.set_option('max_colwidth',150)

data_df = pd.DataFrame.from_dict(data).transpose()
data_df.columns = ['transcript']
data_df = data_df.sort_index()
data_df

Unnamed: 0,transcript
indiatoday,[Music]\r\n\r\nthe Supreme Court's our yoga verdict was\r\n\r\ndelivered today five judge unanimous\r\n\r\nverdict that has given several big\r\n\...
ndtv,hello and welcome you're watching left\r\n\r\nright and center I'm Sanka the father it\r\n\r\nis curtains on a very old case of\r\n\r\nproperty di...
republic,hello and welcome ladies and gentlemen\r\n\r\nthis is live edition of the Sunday\r\n\r\ndebate and yesterday we it was a\r\n\r\nhistoric day when ...


In [4]:
data_df.transcript.loc['ndtv']



In [5]:
import re
import string

def clean_text_round1(text):
    '''Make text lowercase, remove text in square brackets, remove punctuation and remove words containing numbers.'''
    text = text.lower()
    text = re.sub('\[.*?\]', '', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub('\w*\d\w*', '', text)
    return text

round1 = lambda x: clean_text_round1(x)

In [6]:
data_clean = pd.DataFrame(data_df.transcript.apply(round1))

In [7]:
data_clean

Unnamed: 0,transcript
indiatoday,\r\n\r\nthe supreme courts our yoga verdict was\r\n\r\ndelivered today five judge unanimous\r\n\r\nverdict that has given several big\r\n\r\ntakea...
ndtv,hello and welcome youre watching left\r\n\r\nright and center im sanka the father it\r\n\r\nis curtains on a very old case of\r\n\r\nproperty disp...
republic,hello and welcome ladies and gentlemen\r\n\r\nthis is live edition of the sunday\r\n\r\ndebate and yesterday we it was a\r\n\r\nhistoric day when ...


In [8]:
def clean_text_round2(text):
    '''Get rid of some additional punctuation and non-sensical text that was missed the first time around.'''
    text = re.sub('[‘’“”…]', '', text)
    text = re.sub('\n', '', text)
    return text

round2 = lambda x: clean_text_round2(x)

In [9]:
data_clean = pd.DataFrame(data_clean.transcript.apply(round2))
data_clean

Unnamed: 0,transcript
indiatoday,\r\rthe supreme courts our yoga verdict was\r\rdelivered today five judge unanimous\r\rverdict that has given several big\r\rtakeaways if youve ju...
ndtv,hello and welcome youre watching left\r\rright and center im sanka the father it\r\ris curtains on a very old case of\r\rproperty dispute which th...
republic,hello and welcome ladies and gentlemen\r\rthis is live edition of the sunday\r\rdebate and yesterday we it was a\r\rhistoric day when the iot a ve...


In [10]:
data_df
data_df.to_pickle("corpus.pkl")

In [11]:
from sklearn.feature_extraction.text import CountVectorizer

cv = CountVectorizer(stop_words='english')
data_cv = cv.fit_transform(data_clean.transcript)
data_dtm = pd.DataFrame(data_cv.toarray(), columns=cv.get_feature_names())
data_dtm.index = data_clean.index
data_dtm

Unnamed: 0,aapke,aaron,aarthi,ab,aberration,abided,ability,able,absent,absolutely,...,youre,youve,yupi,zafar,zakir,zameen,zameer,zfg,zoom,zubrowka
indiatoday,0,0,0,0,0,0,0,7,0,0,...,4,1,1,0,0,1,0,0,0,1
ndtv,0,0,2,0,0,0,0,4,0,3,...,5,3,0,1,1,0,2,0,0,0
republic,2,1,0,1,1,1,3,3,1,5,...,7,4,0,0,0,0,0,1,2,0


In [12]:
data_dtm.to_pickle("dtm.pkl")

In [13]:
data_clean.to_pickle('data_clean.pkl')
pickle.dump(cv, open("cv.pkl", "wb"))