In [2]:
import os
from sklearn.model_selection import train_test_split
from nltk.tokenize.casual import TweetTokenizer
from nltk.tokenize.treebank import TreebankWordDetokenizer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, TfidfTransformer
from sklearn.decomposition import TruncatedSVD
from sklearn.svm import LinearSVC
from sklearn import metrics
from xml.etree import ElementTree
from sklearn.pipeline import Pipeline, FeatureUnion
import matplotlib.pyplot as plt

In [3]:
config = {
    'dataset_name': 'PAN 2018 English',
    'xmls_directory': '../data/pan18-author-profiling-training-dataset-2018-02-27/en/text/',
    'truth_path': '../data/pan18-author-profiling-training-dataset-2018-02-27/en/en.txt',
    'txts_destination_directory': '../data/pan18-author-profiling-training-dataset-2018-02-27/en',
}

In [4]:
def load_pan_data(xmls_directory, truth_path, txts_destination_directory):
    """ 
    Loads Pan data

    @return: 
    1. merged tweets of authors
    2. truths(read genders of the authors)
    3. author ids
    4. original tweet lengths of authors

    @TODO:
        Make sure that the data is read properly and is not misaligned
    """

    # read tweets of the authors
    # read the filenames from the xmls_dir
    xmls_filenames = sorted(os.listdir(xmls_directory))

    # xml filename = (author_id.xml)
    # to ge author id: split(filename)[0]
    author_ids = []
    for xml_filename in xmls_filenames:
        author_ids.append(xml_filename[:-4])

    #### truths --> go to truth location
    # split(:::)
    # get the first element
    # make sure that order is same in both cases, we are mapping the same things
    # since, author ids are sorted, hence truth file should be sorted as well
    truths_temp = []
    with open(truth_path, 'r') as truth_file:
        # sort ids
        for line in sorted(truth_file):

            line.rstrip('\n')

            entry = line.split(':::')
            truths_temp.append(entry)

    truths = []
    # make sure allignment is correct
    for author_idx, truth_vector in enumerate(truths_temp):

        if(author_ids[author_idx] != truth_vector[0]):
            print(author_ids[author_idx], truths_temp[0])
            print("Ids in truths file and Ids in author ids array do not align")
            return
        truths.append(truth_vector[1])

    ############# truths are constructed as well ###########################
    ##### form: merged tweets and original tweet lengths of authors ########

    # files are xml files 
    # we need ElementTree module
    original_tweet_lengths = []

    merged_tweets = []

    # get filenames and read tweets
    for author_idx, xml_filename in enumerate(xmls_filenames):
        # read filename
        # construct tree of xml
        tree = ElementTree.parse(os.path.join(xmls_directory, xml_filename), parser = ElementTree.XMLParser(encoding = 'utf-8'))

        # get the root element of file
        root = tree.getroot()

        original_tweet_lengths.append([])

        tweets_of_this_author = []

        # root[0] --> first level of the tree
        # since the tweets are in 1st level 
        for child in root[0]:

            tweet = child.text 

            original_tweet_lengths[author_idx].append(len(tweet))

            # replace \n with lineFeed
            tweet.replace('\n', '<LineFeed>')

            tweets_of_this_author.append(tweet)

        
        # store tweets as string
        # string separated by <EndOfTweet> 
        merged_tweets_of_this_author = "<EndOfTweet>".join(tweets_of_this_author)+"<EndOfTweet>"

        merged_tweets.append(merged_tweets_of_this_author)

    return merged_tweets, truths, author_ids, original_tweet_lengths


In [5]:

print("Starting the project...")

    ### 1 -> Read the data from the files
merged_tweets, truths, author_ids, original_tweet_lengths = load_pan_data(config['xmls_directory'], config['truth_path'], config['txts_destination_directory'])
print("Loaded Pan data")

Starting the project...
Loaded Pan data


In [6]:
docs_train, docs_test, y_train, y_test, author_ids_train, author_ids_test, original_tweet_lengths_train, original_tweet_lengths_test\
    = train_test_split(merged_tweets, truths, author_ids, original_tweet_lengths, test_size = 0.4, random_state = 42, stratify = truths)
print("Performed train test split")

Performed train test split


In [7]:
################## Better tweet preprocesser ######################

# import main libraries
import nltk
from nltk import word_tokenize, FreqDist
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
nltk.download
nltk.download('wordnet')
nltk.download('stopwords')
from nltk.tokenize import TweetTokenizer


[nltk_data] Downloading package wordnet to /home/rishabh/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/rishabh/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [8]:
import re

In [96]:
my_tweet = "Yo.. Life is 2 good xoxo #fun3Lyf #amazing @Ramsharam"


# preprocess this
#1. remove digits and lowercase the words
my_tweet = re.sub('\d', '', my_tweet)
#2. lowercase digits
my_tweet = my_tweet.lower()



######## remove punctuations ###############
def remove_punctuation(words):
    new_words = []
    for word in words:
        new_word = re.sub(r'[^\w\s]', '', (word))
        if new_word != '':
            new_words.append(new_word)
    return new_words

my_tweet= ' '.join(remove_punctuation(my_tweet.split(' ')))

###############################################
# Lemmatization and tokenization using TweetTokenizer
lemmatizer = nltk.stem.WordNetLemmatizer()
tokenizer = TweetTokenizer()

my_tweet_words = []
for word in tokenizer.tokenize(my_tweet):
    my_tweet_words.append(lemmatizer.lemmatize(word))

my_tweet = ' '.join(my_tweet_words)


my_tweet

'yo life is good xoxo funlyf amazing ramsharam'

In [9]:
def preprocess_tweet(my_tweet):
    """
    This function will preprocess the input tweet

    Steps for preprocessing:
        1. Lowercase the letters
        2. Replace the characters with frequency greater than 3 with 3 in a word
        3. Replace a url with Tag: <URLURL>
        4. Replace a tag mention: <UsernameMention>

    
    @TODO:
        1. Look for better preprocessing methods on the web
        2. Apply here
    """
    # steps 1 and 2 done
    # 1. Remove digits and lowercase the text
    # remove digits with empty string
    my_tweet = re.sub('\d', '', my_tweet)
    #2. lowercase digits
    my_tweet = my_tweet.lower()

    tokenizer = TweetTokenizer(preserve_case = False, reduce_len = True)
    tokens = tokenizer.tokenize(my_tweet)
    
    for index, token in enumerate(tokens):
        if(token[0:8] == "https://"):
            


    ######## remove punctuations ###############
    ### remove @username mentions with <UsernameMention>
    for 


    return preprocessed_tweet

In [10]:
def extract_features(docs_train, docs_test, perform_dimensionality_reduction):
    """ 
    We will extract features from the dataset, preprocess it and return the X_train and X_test
    
    @return:
        1. X_train: Feature matrix for training data
        2. X_test: Feature matrix for test data


    @Regions of improvement:
        1. Get more features and use them to get more accurate predictions 
    @TODO:
        We are only taking word-ngrams, Use char-ngrams in the features as well
    """
    word_ngram_range = (1, 3)
    # perform_dimensionality_reduction = True
    # print(docs_test[0])

    '''
    Build a char_vectorizer and combine word_vectorizer and char_vectorizer to make an n_grams vectorizer
    '''

    word_vectorizer = TfidfVectorizer(preprocessor=preprocess_tweet,
                                    analyzer='word',
                                    ngram_range=word_ngram_range,
                                    min_df=2,
                                    use_idf=True, 
                                    sublinear_tf=True)
    print("Created a word vectorizer")

    char_vectorizer = TfidfVectorizer(preprocessor=preprocess_tweet,
                                     analyzer='char', 
                                     ngram_range=(3, 5),
                                     min_df=2, 
                                     use_idf=True, 
                                     sublinear_tf=True)
    print("Created a character vectorizer")

    '''
    Merge the two vectorizers using a pipeline
    '''
    ngrams_vectorizer = Pipeline([('feats', FeatureUnion([('word_ngram', word_vectorizer),
                                                         ('char_ngram', char_vectorizer),
                                                         ])),
                                 # ('clff', LinearSVC(random_state=42))
                                 ])
    


    # fitTransform this thing 
    X_train = ngrams_vectorizer.fit_transform(docs_train) #it will take a lot of time... i think
    X_test = ngrams_vectorizer.fit_transform(docs_test)
    print("Performed fitting of data")
    ############ perform dimensionality reduction ################
    
    if(perform_dimensionality_reduction == True):
        print("Performing dimensionality reduction")
        # use TruncatedSVD to reduce dimensionality of our dataset
        svd = TruncatedSVD(n_components = 300, random_state = 42)

        X_train = svd.fit_transform(X_train)
        X_test = svd.fit_transform(X_test)
        print("Performed dimensionality reduction")


    # print(docs_train[0])
    return X_train, X_test

    # n-grams based on characters and words 


    # use different methods for feature extraction

In [11]:
def train_and_test_model(clf, X_train, y_train, X_test, y_test):
    # training phase
    # fit the X_train, y_train on the clf
    clf.fit(X_train, y_train)

    '''
    Predict the output of the test set
    '''
    y_predicted = clf.predict(X_test)

    '''
    Build the confusion matrix
    '''
    # confusion_matrix = metrics.confusion_matrix(y_test, y_predicted)

    # # plt.imshow(confusion_matrix)

    # # plt.set_cmap('jet')

    # # plt.show()


    ###################### print the accuracy of our classifier ###########################
    accuracy = metrics.accuracy_score(y_test, y_predicted)
    print(f'Accuracy of our classifier is : {accuracy}')


Starting the project...


NameError: name 'entry' is not defined

In [12]:
author_ids_train, docs_train, y_train, original_tweet_lengths_train = [list(tuple) for tuple in zip(*sorted(zip(
        author_ids_train, docs_train, y_train, original_tweet_lengths_train)))]
    # Sort the test set
author_ids_test, docs_test, y_test, original_tweet_lengths_test = [list(tuple) for tuple in zip(*sorted(zip(
    author_ids_test, docs_test, y_test, original_tweet_lengths_test)))]

print("Sorted the records based on author ids")

Sorted the records based on author ids


In [13]:
from sklearn.neighbors import KNeighborsClassifier

# decision tree classifier
from sklearn.tree import DecisionTreeClassifier

#random forest classifier
from sklearn.ensemble import RandomForestClassifier

#naive bayes classifier
from sklearn.naive_bayes import GaussianNB

In [16]:
#########################################################################
'''
Try different classifiers and compare their accuracies
'''
# make a classifier
clf = LinearSVC(random_state = 42) # --> 0.5508

################ different classifiers ##########################
# Naive Bayes classifier
# clf = GaussianNB() ## --> 0.5275


# Random Forest classifier
# clf = RandomForestClassifier(n_estimators=250) #--> 0.521


# Decision Tree classifier
# clf = DecisionTreeClassifier() --> 0.51

# k nearest neighbours classifier
# clf = KNeighborsClassifier(n_neighbors=5,  metric='minkowski', p=2) #--> 0.58

#########################################################################
print("Built a classifier")



Built a classifier


In [15]:
X_train, X_test = extract_features(docs_train, docs_test, perform_dimensionality_reduction=True)
print("Successfully extracted features from the documents")

Created a word vectorizer
Created a character vectorizer
Performed fitting of data
Performing dimensionality reduction
Performed dimensionality reduction
Successfully extracted features from the documents


In [17]:
    # training and testing phase
train_and_test_model(clf, X_train, y_train, X_test, y_test)
print("Done training of the dataset")

Accuracy of our classifier is : 0.4241666666666667
Done training of the dataset
