In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import nltk
import spacy
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.model_selection import train_test_split
from sklearn.pipeline import FeatureUnion, Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB, GaussianNB
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.feature_selection import chi2
from sklearn.metrics import classification_report
from sklearn.feature_selection import SelectKBest
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import LinearSVC, SVC
from sklearn.model_selection import cross_val_score, validation_curve



In [2]:
# Read the training CSV and the tweet CSV, in that order, into two DataFrames.
train_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in ('data/cleanfilmdata.csv', 'data/tweetdata.csv')
)

In [3]:
import re

In [4]:
from sklearn.feature_extraction import DictVectorizer
class SegmentFeaturizer:
    """Turn text segments into dictionaries of hand-crafted linguistic features.

    Each segment is parsed with the spaCy ``en_core_web_sm`` pipeline and
    reduced to a flat ``{feature_name: number}`` dict (see ``featurize``).
    All ``get_*`` / ``count_*`` helpers are static and accept any parsed doc.
    """

    # Penn Treebank tags treated as proper nouns by count_propernouns.
    PROPER_NOUN_TAGS = frozenset(['NNP', 'NNPS'])
    # Dependency labels counted by get_n_complex_clauses.
    COMPLEX_CLAUSE_DEPS = frozenset({"ccomp", "xcomp", "advcl", "dative"})
    # Lower-cased surface forms counted by get_female_pronouns / get_male_pronouns.
    FEMALE_WORDS = frozenset(['her', 'she', 'wife', 'girlfriend'])
    MALE_WORDS = frozenset(['he', 'him', 'husband', 'honey', 'his', 'boyfriend'])
    # Lower-cased lemmas counted by get_swear_words.
    SWEAR_LEMMAS = frozenset(["fuck", "shit", "bitch", "hell", "asshole", "ass"])

    def __init__(self):
        """Load the spaCy model once; it is reused for every ``featurize`` call."""
        self.nlp = spacy.load("en_core_web_sm")
        # Not read by any method in this class; kept for backward compatibility.
        self.future_words = ["tomorrow", "future", "futures"]

    @staticmethod
    def _token_ratio(doc, predicate):
        """Return the fraction of tokens in ``doc`` for which ``predicate(token)`` is true.

        Returns the integer ``0`` when no token matches (which includes an
        empty ``doc``), so a division by zero can never occur.
        """
        total = 0
        hits = 0
        for token in doc:
            total += 1
            if predicate(token):
                hits += 1
        if hits == 0:
            return 0
        return hits / total

    @staticmethod
    def count_propernouns(doc):
        """Return the share of tokens whose fine-grained tag is NNP or NNPS."""
        return SegmentFeaturizer._token_ratio(
            doc, lambda token: token.tag_ in SegmentFeaturizer.PROPER_NOUN_TAGS
        )

    @staticmethod
    def get_n_words_before_main_verb(doc):
        """Return the mean number of tokens preceding a verbal ROOT, per sentence.

        Only sentences whose ROOT token has ``pos_ == "VERB"`` contribute a
        distance. The list is seeded with a single ``0`` so the mean is always
        defined; note that this seed is also part of the average.
        """
        numbers = [0]
        for sent in doc.sents:
            # Skip a sentence with no ROOT token instead of raising IndexError.
            main = next((t for t in sent if t.dep_ == "ROOT"), None)
            if main is not None and main.pos_ == "VERB":
                numbers.append(main.i - sent[0].i)
        return np.mean(numbers)

    @staticmethod
    def get_n_complex_clauses(doc):
        """Return the mean per-sentence count of ccomp/xcomp/advcl/dative tokens.

        Returns ``0`` when ``doc`` has no sentences.
        """
        embedded_elements_count = [
            sum(1 for t in sent if t.dep_ in SegmentFeaturizer.COMPLEX_CLAUSE_DEPS)
            for sent in doc.sents
        ]
        if not embedded_elements_count:
            return 0
        return np.mean(embedded_elements_count)

    @staticmethod
    def get_mean_sentiment(doc):
        """Return ``doc.sentiment`` unchanged.

        NOTE(review): the stock ``en_core_web_sm`` pipeline does not appear to
        populate ``Doc.sentiment``, so this feature is probably a constant
        0.0 -- confirm, and consider a real sentiment scorer if so.
        """
        return doc.sentiment

    @staticmethod
    def get_pronouns(doc):
        """Return the share of tokens whose coarse POS is ``PRON``."""
        return SegmentFeaturizer._token_ratio(doc, lambda token: token.pos_ == "PRON")

    @staticmethod
    def get_female_pronouns(doc):
        """Return the share of tokens whose lower-cased text is in ``FEMALE_WORDS``."""
        return SegmentFeaturizer._token_ratio(
            doc, lambda token: token.text.lower() in SegmentFeaturizer.FEMALE_WORDS
        )

    @staticmethod
    def get_male_pronouns(doc):
        """Return the share of tokens whose lower-cased text is in ``MALE_WORDS``."""
        return SegmentFeaturizer._token_ratio(
            doc, lambda token: token.text.lower() in SegmentFeaturizer.MALE_WORDS
        )

    # Fix: this was missing @staticmethod, so calling it on an instance bound
    # the instance to ``doc`` and raised TypeError.
    @staticmethod
    def get_swear_words(doc):
        """Return the share of tokens whose lower-cased lemma is in ``SWEAR_LEMMAS``.

        This value is not part of the dict built by ``featurize``.
        """
        return SegmentFeaturizer._token_ratio(
            doc, lambda token: token.lemma_.lower() in SegmentFeaturizer.SWEAR_LEMMAS
        )

    # putting it all together!
    def featurize(self, segments):
        """Parse ``segments`` with ``self.nlp.pipe`` and return one feature dict each.

        Args:
            segments: iterable of texts, passed straight to ``self.nlp.pipe``.

        Returns:
            list of dicts, one per parsed doc, in the order yielded by the pipe.
        """
        feature_dicts = []
        for doc in self.nlp.pipe(segments):
            feature_dicts.append({
                "n_propernouns": self.count_propernouns(doc),
                "n_words_before_main_verb": self.get_n_words_before_main_verb(doc),
                "n_complex_clauses": self.get_n_complex_clauses(doc),
                "mean_sentiment": self.get_mean_sentiment(doc),
                "n_pronouns": self.get_pronouns(doc),
                "n_male_pronouns": self.get_male_pronouns(doc),
                "n_female_pronouns": self.get_female_pronouns(doc),
            })
        return feature_dicts

In [5]:
from sklearn.base import BaseEstimator, TransformerMixin

# One module-level featurizer shared by every transformer instance, so the
# spaCy model loaded in SegmentFeaturizer.__init__ is loaded only once.
segment_featurizer = SegmentFeaturizer()


class CustomLinguisticFeatureTransformer(BaseEstimator, TransformerMixin):
    """scikit-learn transformer that delegates to the shared ``segment_featurizer``."""

    def __init__(self):
        """Take no hyper-parameters."""

    def fit(self, x, y=None):
        """Learn nothing; return this transformer unchanged."""
        return self

    def transform(self, data):
        """Return ``segment_featurizer.featurize(data)``: one feature dict per item."""
        feature_dicts = segment_featurizer.featurize(data)
        return feature_dicts

In [6]:
# Hand-crafted-feature pipeline: feature dicts -> vectorised matrix -> linear SVM.
manual_pipeline = Pipeline([
    ("stats", CustomLinguisticFeatureTransformer()),
    ("dict_vect", DictVectorizer()),
    ("classifier", LinearSVC()),
])

In [7]:
# Tweet data used for prediction and scoring in the following cell.
# NOTE(review): this reads the same path that is already loaded into `test_df`
# earlier in the notebook -- consider reusing that frame instead of re-reading.
tweets = pd.read_csv('data/tweetdata.csv')

In [None]:
# Fit on train_df, predict labels for the tweets, then build the text report.
# Pipeline.fit returns the pipeline itself, so predict can be chained onto it.
y_pred = manual_pipeline.fit(train_df['text'], train_df['target']).predict(tweets['text'])
crmanual = classification_report(y_true=tweets['target'], y_pred=y_pred)

