In [1]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
import re
!pip install nltk
from nltk.stem.porter import PorterStemmer

pd.options.display.max_columns = 30



You should consider upgrading via the 'pip install --upgrade pip' command.[0m


# Read in the FOX and MSNBC dataframes
Text files include up to 200 pages of transcript from each network with show hosts, show names, formatting and stage directions removed.

In [2]:
df_fox = pd.read_csv('fox2.txt', delimiter = "\n", error_bad_lines=False, engine='python')

Skipping line 1074: '
' expected after '"'
Skipping line 1102: '
' expected after '"'
Skipping line 1152: unexpected end of data


In [3]:
df_fox['channel'] = "fox"

In [4]:
df_fox.head()

Unnamed: 0,script,channel
0,The sausage-making machine sometimes spits out...,fox
1,These are negotiations primarily through Democ...,fox
2,"All right, finally, something I can relate t...",fox
3,"The fact of the matter is, they're going to ac...",fox
4,"Food for thought. This is some pricey sausage,...",fox


In [5]:
df_msnbc = pd.read_csv('msnbc2.txt', delimiter = "\n", error_bad_lines=False)

In [6]:
df_msnbc.head()

Unnamed: 0,script
0,"If you run, please come back and let us know...."
1,Tonight on ALL IN.
2,I`m not the only person on screen right now w...
3,The Matt Gaetz case gets even stranger. He no...
4,"No, I didn`t know about it. Yes, I`m surprise..."


In [7]:
df_msnbc['channel'] = "msnbc"

## Filtering to lines with more than 10 words

In [8]:
df = pd.concat([df_fox, df_msnbc], sort=False).reset_index()

In [9]:
df['words'] = df['script'].map(lambda x: len(x.split(" ")))

In [10]:
df['channel_bool'] = (df.channel == 'fox').astype(int)

In [11]:
df_v1 = df

In [12]:
df

Unnamed: 0,index,script,channel,words,channel_bool
0,0,The sausage-making machine sometimes spits out...,fox,23,1
1,1,These are negotiations primarily through Democ...,fox,56,1
2,2,"All right, finally, something I can relate t...",fox,27,1
3,3,"The fact of the matter is, they're going to ac...",fox,24,1
4,4,"Food for thought. This is some pricey sausage,...",fox,76,1
...,...,...,...,...,...
3302,2155,"Meanwhile, today, there`s also Oscar news rol...",msnbc,51,0
3303,2156,The movie is directed by Travon Free and stars...,msnbc,21,0
3304,2157,There may be a theme here that strong art conf...,msnbc,17,0
3305,2158,"Now, on , we s our congrats and good luck to a...",msnbc,69,0


In [13]:
df.words.describe()

count    3307.000000
mean       48.191715
std        75.762773
min         1.000000
25%        18.000000
50%        38.000000
75%        58.000000
max      1426.000000
Name: words, dtype: float64

In [14]:
df = df[df.words > 15]

In [15]:
df['channel'].value_counts()

msnbc    1673
fox       913
Name: channel, dtype: int64

# TF-IDF Vectorizer

In [16]:
from sklearn.feature_extraction.text import TfidfVectorizer


porter_stemmer = PorterStemmer()

def stemming_tokenizer(str_input):
    words = re.sub(r"[^A-Za-z0-9\-]", " ", str_input).lower().split()
    words = [porter_stemmer.stem(word) for word in words]
    return words

tfidf_vectorizer = TfidfVectorizer(stop_words='english', tokenizer=stemming_tokenizer, use_idf=True, norm='l1', max_features=300)
X = tfidf_vectorizer.fit_transform(df['script'])
pd.DataFrame(X.toarray(), columns=tfidf_vectorizer.get_feature_names())



Unnamed: 0,--,abl,absolut,act,actual,administr,ago,agre,alleg,allow,alreadi,alway,america,american,ani,...,wasn,watch,way,week,went,whi,white,wit,women,word,work,world,ye,year,york
0,0.160224,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.292059,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.141433,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.000000,0.0,0.0,0.0,0.134585,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2581,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2582,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2583,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.426254,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2584,0.078439,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# Train Test Split

In [17]:
y = df['channel_bool']

In [18]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y)

In [19]:
from sklearn.svm import LinearSVC
clf = LinearSVC()
clf.fit(X_train, y_train)

LinearSVC()

In [20]:
from sklearn.metrics import confusion_matrix

y_true = y_test
y_pred = clf.predict(X_test)
matrix = confusion_matrix(y_true, y_pred)

label_names = pd.Series(['msnbc', 'fox'])
pd.DataFrame(matrix,
     columns='Predicted ' + label_names,
     index='Is ' + label_names)

Unnamed: 0,Predicted msnbc,Predicted fox
Is msnbc,377,39
Is fox,138,93


In [21]:
clf.score(X_test, y_test)

0.7264296754250387

In [28]:
y_pred2 = clf.predict(X)
df['prediction'] = pd.DataFrame(y_pred2)
df.prediction.value_counts(dropna=False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['prediction'] = pd.DataFrame(y_pred2)


0.0    1669
NaN     492
1.0     425
Name: prediction, dtype: int64

In [29]:
X_df = pd.DataFrame(X.toarray(), columns=tfidf_vectorizer.get_feature_names())

In [30]:
import eli5

feature_names=list(X_df.columns)
eli5.show_weights(clf, feature_names=feature_names)

Weight?,Feature
+4.139,end
+2.844,border
+2.638,cuomo
+2.488,mask
+2.359,polici
+2.237,economi
+2.174,china
+2.103,filibust
+1.944,immigr
+1.881,trillion


In [31]:
eli5.explain_weights(clf, feature_names=feature_names, top=300)

Weight?,Feature
+4.139,end
+2.844,border
+2.638,cuomo
+2.488,mask
+2.359,polici
+2.237,economi
+2.174,china
+2.103,filibust
+1.944,immigr
+1.881,trillion


In [32]:
#Notes:
#Might edit tokenizer for cleaner answers or take out some words
#Some are NaN and not sure why. Need to fix that

In [33]:
df

Unnamed: 0,index,script,channel,words,channel_bool,prediction
0,0,The sausage-making machine sometimes spits out...,fox,23,1,1.0
1,1,These are negotiations primarily through Democ...,fox,56,1,1.0
2,2,"All right, finally, something I can relate t...",fox,27,1,1.0
3,3,"The fact of the matter is, they're going to ac...",fox,24,1,0.0
4,4,"Food for thought. This is some pricey sausage,...",fox,76,1,1.0
...,...,...,...,...,...,...
3302,2155,"Meanwhile, today, there`s also Oscar news rol...",msnbc,51,0,
3303,2156,The movie is directed by Travon Free and stars...,msnbc,21,0,
3304,2157,There may be a theme here that strong art conf...,msnbc,17,0,
3305,2158,"Now, on , we s our congrats and good luck to a...",msnbc,69,0,
