In [99]:
#IMPORTS
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from torch.autograd import Variable
from torchvision import transforms
from nltk import TweetTokenizer
import re
import string
from sklearn.feature_extraction.text import CountVectorizer
import random
import math

from nltk.corpus import stopwords 
stopwords_english = stopwords.words('english')
import string

from nltk.stem import PorterStemmer
stemmer = PorterStemmer()


In [253]:
#load data
tweet_path = "./data/tweetsUsers.csv"

#integer class label assigned to each known account
USER_LABELS = {
    'HillaryClinton': 0,
    'AnnCoulter': 1,
    'TrumpsGAGirl': 2,
    'realDonaldTrump': 3,
    'TomiLahren': 4,
    'MADENAMERUCA': 5,
    'LastStand2019': 6,
    'Mecdty': 7,
    'Birdle_2963': 8,
}

#load data with pandas and return the tweet texts and user labels as numpy arrays
def load_tweets(tweet_path, max_per_user=100):
    """
    Read the tweet CSV and return (texts, labels) as two aligned numpy arrays.

    Args:
        tweet_path: path of a CSV file with at least the columns
                    'screen_name' and 'text'
        max_per_user: keep at most this many rows per screen name
                      (default 100, kept small for testing purposes)
    Returns:
        comments: numpy array holding the 'text' column
        users: numpy array holding the label of each row; screen names found
               in USER_LABELS become their integer label, any other screen
               name is passed through unchanged
    """
    tweets = pd.read_csv(tweet_path)

    #mergesort is stable, so "the first max_per_user rows" of each user are
    #the first ones in file order
    tweets = (tweets.sort_values('screen_name', kind='mergesort')
                    .groupby('screen_name')
                    .head(max_per_user))
    comments = tweets['text']

    #build the labels as a new Series instead of assigning through chained
    #indexing, which raises SettingWithCopyWarning and is a silent no-op
    #under pandas Copy-on-Write
    users = tweets['screen_name'].map(lambda name: USER_LABELS.get(name, name))
    return comments.to_numpy(), users.to_numpy()

tweets, users = load_tweets(tweet_path)
tweets = tweets[0:500] #use first 500 for testing
#np.long is not available in NumPy 1.24-1.26; np.int64 is the dtype that
#torch's .long() expects and exists in every NumPy version
users = users[0:500].astype(np.int64)

#remove old style retweets, hashtags and hyperlinks in a string tweet
def clean_tweet(tweet):
    """
    Strip retweet markers, hash signs and hyperlinks from one raw tweet.

    Args:
        tweet: the raw tweet text (str)
    Returns:
        the tweet with a leading "RT " removed, every '#' character removed
        (the hashtag word itself is kept) and every http/https URL removed
    """
    tweet = re.sub(r'^RT\s+', '', tweet)
    tweet = re.sub(r'#', '', tweet)
    #match only the URL itself (up to the next whitespace); the previous
    #pattern used '.*' and therefore dropped all text following a link
    tweet = re.sub(r'https?://\S+', '', tweet)
    return tweet

#drop stopword and punctuation tokens from a list of tokens
def clean_tweets(tweet_tokens):
    """
    Filter one tokenized tweet.

    Args:
        tweet_tokens: list of string tokens
    Returns:
        a new list with the tokens, in their original order, that are neither
        in stopwords_english nor contained in string.punctuation
    """
    return [
        token
        for token in tweet_tokens
        if not (token in stopwords_english or token in string.punctuation)
    ]

#tokenize and clean tweets: return a bag-of-words count matrix and its vocabulary size
def tokenize(tweets):
    """
    Turn raw tweets into bag-of-words count vectors.

    Each tweet is passed through clean_tweet, tokenized with TweetTokenizer
    (lower-cased, @handles stripped), filtered with clean_tweets and re-joined
    into one string; CountVectorizer then builds the count matrix over all tweets.

    Args:
        tweets: iterable of raw tweet strings
    Returns:
        X: dense float64 numpy array with one row per tweet and one column
           per vocabulary word
        num_feats: number of columns of X (the vocabulary size)
    """
    tokenizer = TweetTokenizer(preserve_case=False, strip_handles=True)
    cleaned = [
        " ".join(clean_tweets(tokenizer.tokenize(clean_tweet(tweet))))
        for tweet in tweets
    ] #list of cleaned strings

    #use bag of words model
    vectorizer = CountVectorizer()
    #np.float was removed from NumPy; np.float64 is the same dtype
    X = vectorizer.fit_transform(cleaned).toarray().astype(np.float64)
    #the column count equals the vocabulary size and does not depend on
    #get_feature_names(), which newer scikit-learn releases no longer provide
    return X, X.shape[1]

#tokens: bag-of-words count matrix, one row per tweet, e.g. [[0, 1, 0, 1], [0, 1, 0, 1], ...]
#num_feats: vocabulary size, i.e. the number of columns in tokens
tokens, num_feats = tokenize(tweets)
# print(tokens[0])
# print(tokens)

In [203]:
# ############################################################
# # Extracting and loading data
# ############################################################
class Dataset(torch.utils.data.Dataset):
    """
    In-memory dataset pairing a feature matrix with integer labels.

    The base class is spelled out in full because this class reuses the name
    Dataset: with `class Dataset(Dataset)` a re-run of the cell would subclass
    the previous definition of this class instead of torch's Dataset.

    Args:
        X: numpy array of features, one row per sample
        y: numpy array of labels, one entry per sample
    """
    def __init__(self, X, y):
        self.len = len(X)
        #keep the tensors on the GPU when one is available, otherwise on the CPU
        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        self.x_data = torch.from_numpy(X).float().to(device)
        self.y_data = torch.from_numpy(y).long().to(device)

    def __len__(self):
        """Return the number of samples."""
        return self.len

    def __getitem__(self, idx):
        """Return the (features, label) pair stored at position idx."""
        return self.x_data[idx], self.y_data[idx]

In [245]:
"""
    Randomly choose 20 percent of the training data as validation data.

    Args:
        x_train: bag-of-words feature matrix, one row per tweet
        y_train: integer class label (user id) of each tweet, aligned with x_train
    Returns:
        new_x_train: 80%
        new_y_train
        x_val: 20%
        y_val
"""

def create_validation(x_train, y_train, val_fraction=.2):
    """
    Randomly hold out a fraction of the samples as validation data.

    Args:
        x_train: array-like of samples, indexed along axis 0
        y_train: array-like of labels with the same length as x_train
        val_fraction: fraction of the samples moved to the validation set
                      (default .2); the count is rounded down
    Returns:
        new_x_train: the samples that stay in the training set, original order
        new_y_train: their labels
        x_val: the held-out samples, in the random order they were drawn
        y_val: their labels
    Raises:
        ValueError: if x_train and y_train differ in length
    """
    num_samples = len(x_train)
    if len(y_train) != num_samples:
        raise ValueError("x_train and y_train must have the same length")
    num_validation = math.floor(val_fraction * num_samples)

    #indices of the validation set, random
    validation_indices = random.sample(range(0, num_samples), num_validation)

    #every other index stays in the training set; the set makes each
    #membership test O(1) instead of scanning the list for every sample
    held_out = set(validation_indices)
    training_indices = [i for i in range(num_samples) if i not in held_out]

    x_val = np.take(x_train, validation_indices, axis=0)
    y_val = np.take(y_train, validation_indices, axis=0)

    new_x_train = np.take(x_train, training_indices, axis=0)
    new_y_train = np.take(y_train, training_indices, axis=0)

    return new_x_train, new_y_train, x_val, y_val

#split the bag-of-words matrix and its labels into training and validation parts
new_x_train, new_y_train, x_val, y_val = create_validation(tokens, users)
# print(new_x_train)
# print(x_val[0])


In [248]:
############################################################
# Feed Forward Neural Network
############################################################
class FeedForwardNN(nn.Module):
    """
    Fully connected classifier with one hidden layer: Linear -> ReLU -> Linear.

    self.fc1 is the first fully connected layer (input_size -> hidden_size),
    self.fc2 is the second one (hidden_size -> num_classes).
    """
    #width of the hidden layer, shared by every instance
    hidden_size = 1000

    def __init__(self, input_size, num_classes):
        """Create the layers for input_size features and num_classes outputs."""
        super().__init__()
        self.fc1 = nn.Linear(input_size, self.hidden_size)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(self.hidden_size, num_classes)

    def forward(self, x):
        """Return the raw class scores computed from the input batch x."""
        hidden = self.relu(self.fc1(x))
        return self.fc2(hidden)




In [252]:
"""
    Runs experiment on the model neural network given a train loader, loss function and optimizer and find validation accuracy for each epoch given the validation_loader.

    Args:
        neural_network (NN model that extends torch.nn.Module): For example, it should take an instance of either
                                                                FeedForwardNN or ConvolutionalNN,
        train_loader (DataLoader),
        validation_loader (DataLoader),
        loss_function (torch.nn.CrossEntropyLoss),
        optimizer (optim.SGD)
        num_epochs (number of iterations)
    Returns:
        tuple: First position, training accuracies of each epoch formatted in an array of shape (num_epochs,1).
               Second position, training loss of each epoch formatted in an array of shape (num_epochs,1).
               third position, validation accuracy of each epoch formatted in an array of shape (num_epochs,1).
               
"""

def _classification_accuracy(model, loader):
    """Return the fraction of samples in loader that model classifies correctly (nan if loader is empty)."""
    was_training = model.training
    model.eval()
    correct = 0
    total = 0
    try:
        #no gradients are needed for evaluation; skipping them saves memory and time
        with torch.no_grad():
            for inputs, labels in loader:
                outputs = model(inputs)
                _, predicted = torch.max(outputs, 1)
                total += labels.size(0)
                correct += (predicted == labels).sum().item()
    finally:
        #put the model back into the mode it was in before the evaluation
        model.train(was_training)
    return correct / total if total else float('nan')


def train_val_NN(neural_network, train_loader, validation_loader, loss_function, optimizer, num_epochs):
    """
    Train the network and record training/validation metrics for every epoch.

    Args:
        neural_network: the torch.nn.Module to train; optimizer must have been
                        built over this module's parameters
        train_loader: iterable of (inputs, labels) training batches
        validation_loader: iterable of (inputs, labels) validation batches
        loss_function: callable mapping (outputs, labels) to a scalar loss tensor
        optimizer: torch optimizer used for the update steps
        num_epochs: number of passes over train_loader
    Returns:
        tuple of three arrays of shape (num_epochs, 1):
            training accuracy of each epoch,
            training loss of each epoch (sum of the batch losses),
            validation accuracy of each epoch.
        An accuracy is nan when its loader yields no samples.
    """
    accuracy = np.empty((num_epochs, 1))
    loss_np = np.empty((num_epochs, 1))
    val_accuracy = np.empty((num_epochs, 1))

    model = neural_network

    for epoch in range(num_epochs):
        model.train()
        total_loss = 0.0

        for inputs, labels in train_loader:
            # Forward pass
            outputs = model(inputs)
            loss = loss_function(outputs, labels)

            # Backward and optimize
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            total_loss += loss.item()

        #validation accuracy first, then training accuracy, for this epoch
        val_accuracy[epoch] = _classification_accuracy(model, validation_loader)
        accuracy[epoch] = _classification_accuracy(model, train_loader)
        loss_np[epoch] = total_loss

    return (accuracy, loss_np, val_accuracy)

In [None]:
#run FeedForward
from torch.utils.data import DataLoader

#initialize params
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
learning_rate = 0.001
batch_size = 12 #the value the loaders below were already using
num_epochs = 40
num_classes = 9
loss_function = nn.CrossEntropyLoss()

#load data
y_train = users
x_train = tweets

tokenized, size = tokenize(x_train)
# print("size" + str(size))

new_x_train, new_y_train, x_val, y_val = create_validation(tokenized, y_train)

#create the network BEFORE moving it to the device and building the optimizer:
#an optimizer built over another model's parameters would never update this one,
#and on a fresh kernel `neural_network` would not exist yet at this point
neural_network = FeedForwardNN(size, num_classes)
model = neural_network.to(device)
optimizer = torch.optim.Adagrad(model.parameters(), lr=learning_rate)

#load into dataloader/dataset
train_dataset = Dataset(new_x_train, new_y_train)
validation_dataset = Dataset(x_val, y_val)

#the training rows are grouped by user, so shuffle them to get mixed batches
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
validation_loader = DataLoader(validation_dataset, batch_size=batch_size, shuffle=False)

ffaccuracy,ffloss_np,ffval_accuracy = train_val_NN(model, train_loader, validation_loader, loss_function, optimizer,num_epochs)
print(ffaccuracy)
print(ffloss_np)
print(ffval_accuracy)

#plot per epoch: the epoch index is on the x axis, the metric on the y axis
for values, label in ((ffaccuracy, 'accuracy'),
                      (ffloss_np, 'loss_np'),
                      (ffval_accuracy, 'val_accuracy')):
    plt.figure()
    plt.plot(values, label=label)
    plt.xlabel('epoch')
    plt.ylabel(label)
    plt.legend() #after plot(), so there is a labelled line to show
    plt.show()

In [None]:
"""
    Runs the model neural network on a test loader and collects its predictions.

    Args:
        neural_network (NN model that extends torch.nn.Module): For example, it should take an instance of either
                                                                FeedForwardNN or ConvolutionalNN,
        test_loader (DataLoader), (make sure the loader is not shuffled)
        loss_function (torch.nn.CrossEntropyLoss),
        size_test (presumably the number of test samples)
    Returns:
        your predictions: a list with one tensor of predicted class indices per batch
"""
def test_NN(neural_network, test_loader, loss_function, size_test):
    model = neural_network
    with torch.no_grad():
        correct = 0
        total = 0
        Preds = []
        for images, labels in test_loader:
            images = images.to(device)
            labels = labels.to(device)
            outputs = model(images, size_test)
            _, predicted = torch.max(outputs.data, 1)
            Preds.append(predicted)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()

    return Preds
# 
# 
# with open('HW4_preds.txt', 'w') as f:
#     for item in Preds:
#         f.write("%s\n" % item)

In [None]:
# ############################################################
# # Convolutional Neural Network
# ############################################################
# class ConvolutionalNN(nn.Module):
#     """ 
#     (1) Use self.conv1 as the variable name for your first convolutional layer
#         (2) Use self.pool1 as the variable name for your first pooling layer
#         (3) Use self.conv2 as the variable name for your second convolutional layer
#         (4) Use self.pool2 as the variable name for you second pooling layer  
#         (5) Use self.fc1 as the variable name for your first fully connected layer
#         (6) Use self.fc2 as the variable name for your second fully connected layer
#     """
#      # Hyper-parameters 
#     input_size = 3
#     hidden_size = 2000
#     num_classes = 7
#     num_epochs = 40
#     batch_size = 64
#     learning_rate = 0.01
    
    
#     def __init__(self):
#         super(ConvolutionalNN, self).__init__()
#         self.conv1 = nn.Conv2d(2, 16, kernel_size=3, stride=1, padding=0)
#         self.relu = nn.ReLU()
#         self.pool1 = nn.MaxPool2d(kernel_size=2)
        
#         #reshape fc1 = C*H*W
#         self.fc1 = nn.Linear(8512, 200)
      
#     def forward(self, x):
#         out = self.conv1(x)
#         out = self.relu(out)
#         out = self.pool1(out)
        
#         #reshape out to be 146624
#         (_, C, H, W) = out.data.size()
#         print("size: " + str(C*H*W))
#         out = out.view( -1 , C * H * W)
#         out = self.fc1(out)

#         return out
      

In [129]:
# # Run Baseline CNN
# from torch.utils.data import DataLoader

# #initialize params
# device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# neural_network = ConvolutionalNN()
# model = neural_network.to(device)
# learning_rate = 0.001
# batch_size = 64
# loss_function = nn.CrossEntropyLoss()
# optimizer = torch.optim.Adagrad(neural_network.parameters(), lr=learning_rate)
# num_epochs = 40

# new_x_train, new_y_train, x_val, y_val = create_validation(tweets, users)

# new_x_train = tokenize(new_x_train)
# x_val = tokenize(x_val)

# #load into dataloader/dataset
# train_dataset =  Dataset(new_x_train, new_y_train)
# validation_dataset = Dataset(x_val, y_val)
# train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=False)
# validation_loader = DataLoader(validation_dataset, batch_size=batch_size, shuffle=False)

# accuracy,loss_np,val_accuracy = train_val_NN(neural_network, train_loader, validation_loader, loss_function, optimizer,num_epochs)
# print(accuracy)
# print(loss_np)
# print(val_accuracy)

# #plot per epoch
# plt.figure()
# plt.plot(accuracy, label='accuracy')
# plt.show()
    
# plt.figure()
# plt.plot(loss_np, label='loss_np')
# plt.show()
    
# plt.figure()
# plt.plot(val_accuracy, label='val_accuracy')
# plt.show()

RuntimeError: Expected 4-dimensional input for 4-dimensional weight [16, 2, 3, 3], but got 2-dimensional input of size [64, 2805] instead