In [92]:
import pandas as pd
import re
import numpy as np
import nltk
nltk.download('stopwords')
nltk.download('punkt')
from rake_nltk import Rake
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
from sklearn.feature_extraction.text  import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.metrics import accuracy_score


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Kyle\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Kyle\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [93]:
def read_dataframe(csv_file) -> pd.DataFrame:
   
    df = pd.read_csv("test.csv")
    
    # replaces all "null" or "NaN" values with an empty string
    df.fillna("", inplace=True)
    
    # labels the columns in the dataset using the data dictionary described in the README
    df.columns = [
        'statement',     # Column 1: Statement.
        'label',         # Column 1: Label.

    ]
    
    return df

#create a dataframe from the training data
data = read_dataframe('train.csv')


In [94]:
stemmer = SnowballStemmer('english')
words = stopwords.words("english")

data['preprocessed'] = data['statement'].apply(lambda x: " ".join([stemmer.stem(i) for i in re.sub("[^a-zA-Z]"," ",x).split() if i not in words]).lower())

In [95]:
X_train, X_test,y_train,y_test = train_test_split(data['preprocessed'],data.label, test_size=.3)

In [96]:
pipeline = Pipeline([('vect',TfidfVectorizer(ngram_range=(1,1),stop_words = "english",sublinear_tf=True)),
                    ('chi',SelectKBest(chi2,k=1000)),
                    ('clf',LinearSVC(C=1.0,penalty='l1',max_iter=3000, dual = False))])

In [97]:
model = pipeline.fit(X_train,y_train)

vectorizer = model.named_steps['vect']
chi = model.named_steps['chi']
clf = model.named_steps['clf']

In [98]:
feature_names = vectorizer.get_feature_names()
feautre_names = [feature_names[i] for i in chi.get_support(indices = True)]
feautre_names = np.asarray(feautre_names)

In [99]:
# Output will be used for web crawling
# RAKE
news = "Donald Trump was sworn in as the 45th president of the United States on Jan. 20 after his stunning upset over Democratic rival Hillary Clinton."
r = Rake()
words = r.extract_keywords_from_text(news)
f_words = r.get_ranked_phrases()
print(f_words)

['democratic rival hillary clinton', 'united states', 'stunning upset', 'donald trump', '45th president', 'sworn', 'jan', '20']


In [100]:
print("accuracy_score " + str(model.score(X_test,y_test)))
print(model.predict(["Donald Trump was sworn in as the 45th president of the United States on Jan. 20 after his stunning upset over Democratic rival Hillary Clinton."]))



accuracy_score 0.5718015665796344
[False]
