In [35]:
import pandas as pd
import re
import numpy as np
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
from sklearn.feature_extraction.text  import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.metrics import accuracy_score


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Kyle\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [36]:
def read_dataframe(tsv_file: str) -> pd.DataFrame:
    
   
    df = pd.read_csv(tsv_file, delimiter='\t', dtype=object)
    
    # replaces all "null" or "NaN" values with an empty string
    df.fillna("", inplace=True)
    
    # labels the columns in the dataset using the data dictionary described in the README
    df.columns = [
        'id',                # Column 1: the ID of the statement ([ID].json).
        'label',             # Column 2: the label.
        'statement',         # Column 3: the statement.
        'subjects',          # Column 4: the subject(s).
        'speaker',           # Column 5: the speaker.
        'speaker_job_title', # Column 6: the speaker's job title.
        'state_info',        # Column 7: the state info.
        'party_affiliation', # Column 8: the party affiliation.
    
        'count_1', # barely true counts.
        'count_2', # false counts.
        'count_3', # half true counts.
        'count_4', # mostly true counts.
        'count_5', # pants on fire counts.
        
        'context' # Column 14: the context (venue / location of the speech or statement).
    ]
    
    return df

#create a dataframe from the training data
data = read_dataframe('train.tsv')


In [37]:
stemmer = SnowballStemmer('english')
words = stopwords.words("english")

data['preprocessed'] = data['statement'].apply(lambda x: " ".join([stemmer.stem(i) for i in re.sub("[^a-zA-Z]"," ",x).split() if i not in words]).lower())

In [38]:
X_train, X_test,y_train,y_test = train_test_split(data['preprocessed'],data.label, test_size=.3)

In [39]:
pipeline = Pipeline([('vect',TfidfVectorizer(ngram_range=(1,1),stop_words = "english",sublinear_tf=True)),
                    ('chi',SelectKBest(chi2,k=1000)),
                    ('clf',LinearSVC(C=1.0,penalty='l1',max_iter=3000, dual = False))])

In [40]:
model = pipeline.fit(X_train,y_train)

vectorizer = model.named_steps['vect']
chi = model.named_steps['chi']
clf = model.named_steps['clf']

In [41]:
feature_names = vectorizer.get_feature_names()
feature_names = [feature_names[i] for i in chi.get_support(indices = True)]
feature_names = np.asarray(feature_names)

In [45]:
#user input
user = input("Enter name: ")

p = str(model.predict([user]))
#remove special chars
output = p.replace('[\'', '').replace('\']', '')
print(output)

#accuracy score
accurate = (model.score(X_test,y_test))*100
round_a = round(accurate, 2)
print("Accuracy Score: " + str(round_a) + "%")

pants-fire
Accuracy Score: 22.33%
