In [48]:
#IMPORTS
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from torch.autograd import Variable
from torchvision import transforms
from nltk import TweetTokenizer
import re
import string

from nltk.corpus import stopwords 
stopwords_english = stopwords.words('english')
import string


In [58]:
#load data
# Relative path of the CSV that load_tweets reads; it must provide `text` and `screen_name` columns.
tweet_path = "./data/tweetsUsers.csv"

#load data with pandas and return dataframes (converted to numpy arrays)
def load_tweets(tweet_path, max_per_user=100):
    """Read the tweet CSV and return tweet texts with their author names.

    Args:
        tweet_path: path (or file-like object) accepted by ``pd.read_csv``;
            the CSV must contain ``text`` and ``screen_name`` columns.
        max_per_user: keep at most this many tweets per ``screen_name``
            (default 100, a small sample for quick experiments). Pass ``None``
            to keep every tweet.

    Returns:
        tuple: ``(texts, screen_names)`` as two aligned 1-D numpy arrays,
        ordered by ``screen_name``.
    """
    frame = pd.read_csv(tweet_path)

    # A stable sort keeps the file order inside each user, so "first N per
    # user" is deterministic (the default quicksort gives no such guarantee).
    frame = frame.sort_values('screen_name', kind='mergesort')
    if max_per_user is not None:
        frame = frame.groupby('screen_name').head(max_per_user)

    return frame['text'].to_numpy(), frame['screen_name'].to_numpy()

# Load both aligned arrays and keep only the first 500 rows of each for quick testing.
tweets, users = (column[:500] for column in load_tweets(tweet_path))

#remove old style retweets, hashtags and hyperlinks in a string tweet
def clean_tweet(tweet):
    """Strip retweet markers, hash signs and hyperlinks from one raw tweet.

    Args:
        tweet (str): raw tweet text.

    Returns:
        str: the tweet without a leading old-style ``RT`` marker, without ``#``
        characters (the hashtag word itself is kept) and without URLs.
    """
    tweet = re.sub(r'^RT\s+', '', tweet)
    tweet = re.sub(r'#', '', tweet)
    # Stop at the first whitespace so only the link itself is removed; a
    # greedy `.*` here would also delete every word that follows the link.
    tweet = re.sub(r'https?://\S+', '', tweet)
    return tweet

#remove punctuation and stopwords in a list of tokens
def clean_tweets(tweet_tokens):
    """Filter a token list, dropping English stopwords and punctuation tokens.

    Args:
        tweet_tokens: iterable of string tokens belonging to one tweet.

    Returns:
        list: the tokens that are neither in ``stopwords_english`` nor found in
        ``string.punctuation``, in their original order.
    """
    return [
        token
        for token in tweet_tokens
        if not (token in stopwords_english or token in string.punctuation)
    ]

#tokenize and clean tweets
def tokenize(tweets):
    """Clean and tokenize a collection of raw tweets.

    Each tweet is passed through ``clean_tweet``, split by a lower-casing,
    handle-stripping ``TweetTokenizer`` and then filtered by ``clean_tweets``.

    Args:
        tweets: iterable of raw tweet strings.

    Returns:
        list: one list of cleaned tokens per input tweet, in input order.
    """
    tokenizer = TweetTokenizer(preserve_case=False, strip_handles=True)
    # One pass per tweet: clean the text, tokenize it, then filter the tokens.
    return [clean_tweets(tokenizer.tokenize(clean_tweet(tweet))) for tweet in tweets]

# One list of cleaned tokens per sampled tweet.
tokens = tokenize(tweets)
    
# ############################################################
# # Extracting and loading data
# ############################################################
# The base class is named explicitly: `class Dataset(Dataset)` would inherit
# from this very class once the cell is executed a second time.
class Dataset(torch.utils.data.Dataset):
    """In-memory dataset over a feature array and a label array.

    Both arrays are converted to tensors once, up front, and placed on the GPU
    when CUDA is available (otherwise they stay on the CPU).

    Args:
        X (np.ndarray): numeric feature array; the first axis indexes samples.
        y (np.ndarray): integer class labels aligned with ``X``.

    Raises:
        TypeError: raised by ``torch.from_numpy`` when an array has a dtype
            torch cannot convert (e.g. an object array of raw strings).
    """

    def __init__(self, X, y):
        self.len = len(X)
        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        # Cast features to float32 on every device; without the cast a float64
        # array would yield double tensors that do not match float32 weights.
        self.x_data = torch.from_numpy(X).float().to(device)
        self.y_data = torch.from_numpy(y).long().to(device)

    def __len__(self):
        """Number of samples."""
        return self.len

    def __getitem__(self, idx):
        """Return the ``(features, label)`` pair stored at ``idx``."""
        return self.x_data[idx], self.y_data[idx]
    


In [16]:
"""
    Randomly choose 20 percent of the training data as validation data.

    Args:
        x_train: training images in shape (num_images,3,image_H,image_W)
        y_train: training labels in shape (num_images,)
    Returns:
        new_x_train: training images in shape (0.8*num_images,3,image_H,image_W)
        new_y_train: training labels in shape (0.8*num_images,)
        x_val: validation images in shape (0.2*num_images,3,image_H,image_W)
        y_val: validation labels in shape (0.2*num_images,)
"""
"""
    Randomly choose 20 percent of the training data as validation data.

    Args:
        x_train: training images in shape (num_images,3,image_H,image_W)
        y_train: training labels in shape (num_images,)
    Returns:
        new_x_train: training images in shape (0.8*num_images,3,image_H,image_W)
        new_y_train: training labels in shape (0.8*num_images,)
        x_val: validation images in shape (0.2*num_images,3,image_H,image_W)
        y_val: validation labels in shape (0.2*num_images,)
"""
import random
import math

def create_validation(x_train, y_train, val_fraction=0.2, seed=None):
    """Randomly hold out part of the training data as a validation set.

    Args:
        x_train: array-like of samples; the first axis indexes samples.
        y_train: array-like of labels aligned with ``x_train``.
        val_fraction: fraction of samples moved to the validation set
            (default 0.2); the resulting count is rounded down.
        seed: optional seed for a private random generator so the split is
            reproducible. ``None`` (default) draws from the global ``random``
            state.

    Returns:
        tuple: ``(new_x_train, new_y_train, x_val, y_val)`` as numpy arrays.
        Training samples keep their original relative order; validation
        samples are in the order they were drawn.
    """
    x_train = np.asarray(x_train)
    y_train = np.asarray(y_train)
    num_samples = len(x_train)
    num_validation = math.floor(val_fraction * num_samples)

    rng = random if seed is None else random.Random(seed)
    validation_indices = rng.sample(range(num_samples), num_validation)

    # Set membership keeps this linear instead of scanning a list per sample.
    held_out = set(validation_indices)
    training_indices = [i for i in range(num_samples) if i not in held_out]

    val_idx = np.asarray(validation_indices, dtype=np.intp)
    train_idx = np.asarray(training_indices, dtype=np.intp)

    # Index along the first axis only, so multi-dimensional samples stay whole
    # (np.take without `axis` would index into the flattened array instead).
    return x_train[train_idx], y_train[train_idx], x_train[val_idx], y_train[val_idx]

# Split the sampled tweets and their authors into training and validation parts.
new_x_train, new_y_train, x_val, y_val = create_validation(tweets, users)
# print(new_x_train)
# print(new_y_train)
# print(x_val)
# print(y_val)

["I'm excited to see @CecileRichards, @aliciagarza, @aijenpoo, and the Pantsuit Nation community build this powerful new force for gender equity. Supermajority's goal is to train and mobilize 2 million women over the next two years. https://t.co/vorkl8sMZG"
 'I condemn this act of domestic terrorism. It’s no accident that hate crimes are on the rise. We cannot turn a blind eye to the fact that the president’s embrace of white nationalist rhetoric, and other politicians’ lack of condemnation of this rhetoric, is fueling these attacks. https://t.co/hNxAsntady'
 'So, we did this.  Hillary Clinton Reads the Mueller Report - Klepper https://t.co/3g9tS9uudm via @YouTube'
 ...
 'In Democrat run San Francisco if you don’t clean up your dog’s poop, you’re fined $320.  However, if you defecate on the sidewalk, there’s no fine. You can also get free syringes and use heroin in public. But remember the neither you or your dog are allowed to use plastic straws! https://t.co/Y5CDG6igOG'
 "Good Mornin

In [17]:
############################################################
# Convolutional Neural Network
############################################################
class ConvolutionalNN(nn.Module):
    """Convolutional classifier: (conv -> ReLU -> max-pool) twice, then two linear layers.

    Layer attribute names are fixed:
        (1) self.conv1 is the first convolutional layer
        (2) self.pool1 is the first pooling layer
        (3) self.conv2 is the second convolutional layer
        (4) self.pool2 is the second pooling layer
        (5) self.fc1 is the first fully connected layer
        (6) self.fc2 is the second fully connected layer

    Args:
        fc1_in_features (int): flattened size C*H*W of the second pooling
            output, i.e. the input width of ``fc1``. The default 8512 equals
            32 * 14 * 19, which is what e.g. a 3x62x82 input produces.
        num_classes (int, optional): number of output logits. ``None`` keeps
            the class-level default ``num_classes``.
    """

    # Hyper-parameters.
    # hidden_size, num_epochs, batch_size and learning_rate are not read inside
    # this class; they are kept for code that looks them up on the class.
    input_size = 3
    hidden_size = 2000
    num_classes = 5
    num_epochs = 40
    batch_size = 64
    learning_rate = 0.001

    def __init__(self, fc1_in_features=8512, num_classes=None):
        super().__init__()
        if num_classes is not None:
            # Instance-level override; the class attribute stays untouched.
            self.num_classes = num_classes

        self.conv1 = nn.Conv2d(self.input_size, 16, kernel_size=3, stride=1, padding=0)
        self.relu = nn.ReLU()
        self.pool1 = nn.MaxPool2d(kernel_size=2)
        self.conv2 = nn.Conv2d(16, 32, kernel_size=3, stride=1, padding=0)
        self.pool2 = nn.MaxPool2d(kernel_size=2)

        # fc1 consumes the flattened (C*H*W) output of pool2.
        self.fc1 = nn.Linear(fc1_in_features, 200)
        self.fc2 = nn.Linear(200, self.num_classes)

    def forward(self, x):
        """Map a batch of shape (N, input_size, H, W) to logits of shape (N, num_classes)."""
        out = self.pool1(self.relu(self.conv1(x)))
        out = self.pool2(self.relu(self.conv2(out)))

        # Flatten everything except the batch axis: (N, C, H, W) -> (N, C*H*W).
        out = torch.flatten(out, start_dim=1)
        out = self.relu(self.fc1(out))
        return self.fc2(out)

    # Please do not change the functions below.
    # They will be used to test the correctness of your implementation.

    def get_conv1_params(self):
        return self.conv1.__repr__()

    def get_pool1_params(self):
        return self.pool1.__repr__()

    def get_conv2_params(self):
        return self.conv2.__repr__()

    def get_pool2_params(self):
        return self.pool2.__repr__()

    def get_fc1_params(self):
        return self.fc1.__repr__()

    def get_fc2_params(self):
        return self.fc2.__repr__()

In [9]:
"""
    Runs experiment on the model neural network given a train loader, loss function and optimizer and find validation accuracy for each epoch given the validation_loader.

    Args:
        neural_network (NN model that extends torch.nn.Module): For example, it should take an instance of either
                                                                FeedForwardNN or ConvolutionalNN,
        train_loader (DataLoader),
        validation_loader (DataLoader),
        loss_function (torch.nn.CrossEntropyLoss),
        optimizer (optim.SGD)
        num_epochs (number of iterations)
    Returns:
        tuple: First position, training accuracies of each epoch formatted in an array of shape (num_epochs,1).
               Second position, training loss of each epoch formatted in an array of shape (num_epochs,1).
               third position, validation accuracy of each epoch formatted in an array of shape (num_epochs,1).
               
"""

def train_val_NN(neural_network, train_loader, validation_loader, loss_function, optimizer, num_epochs):
    """Train a network and record per-epoch loss and accuracies.

    Every epoch runs one optimisation pass over ``train_loader`` (in train
    mode), then measures accuracy on ``validation_loader`` and on
    ``train_loader`` (in eval mode, without autograd).

    Args:
        neural_network (torch.nn.Module): model to train; updated in place.
        train_loader: iterable of ``(inputs, labels)`` batches used for training.
        validation_loader: iterable of ``(inputs, labels)`` batches used for validation.
        loss_function: callable ``(outputs, labels) -> scalar loss tensor``.
        optimizer (torch.optim.Optimizer): optimizer bound to the model's parameters.
        num_epochs (int): number of passes over ``train_loader``.

    Returns:
        tuple: ``(accuracy, loss_np, val_accuracy)``, each an array of shape
        ``(num_epochs, 1)``: training accuracy, summed batch loss and
        validation accuracy per epoch. An accuracy is NaN when its loader
        yields no samples.
    """
    accuracy = np.empty((num_epochs, 1))
    loss_np = np.empty((num_epochs, 1))
    val_accuracy = np.empty((num_epochs, 1))

    model = neural_network
    was_training = model.training  # restored before returning

    for epoch in range(num_epochs):
        # Evaluation below switches to eval mode, so re-enable train mode here.
        model.train()
        total_loss = 0.0
        for images, labels in train_loader:
            # Forward pass
            outputs = model(images)
            loss = loss_function(outputs, labels)

            # Backward and optimize
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            total_loss += loss.item()

        val_accuracy[epoch] = _loader_accuracy(model, validation_loader)
        accuracy[epoch] = _loader_accuracy(model, train_loader)
        loss_np[epoch] = total_loss

    model.train(was_training)
    return (accuracy, loss_np, val_accuracy)


def _loader_accuracy(model, loader):
    """Fraction of samples in ``loader`` that ``model`` classifies correctly (NaN if there are none)."""
    correct = 0
    total = 0
    model.eval()
    # No gradients are needed for scoring; skipping them avoids building graphs.
    with torch.no_grad():
        for images, labels in loader:
            _, predicted = torch.max(model(images), 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()
    return correct / total if total else float('nan')

In [10]:
"""
    Runs experiment on the model neural network given a test loader, loss function and optimizer.

    Args:
        neural_network (NN model that extends torch.nn.Module): For example, it should take an instance of either
                                                                FeedForwardNN or ConvolutionalNN,
        test_loader (DataLoader), (make sure the loader is not shuffled)
        loss_function (torch.nn.CrossEntropyLoss),
        optimizer (your choice)
        num_epochs (number of iterations)
    Returns:
        your predictions         
"""
def test_NN(neural_network, test_loader, loss_function):
    model = neural_network
    with torch.no_grad():
        correct = 0
        total = 0
        Preds = []
        for images, labels in test_loader:
            images = images.to(device)
            labels = labels.to(device)
            outputs = model(images)
            _, predicted = torch.max(outputs.data, 1)
            Preds.append(predicted)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()

    return Preds
# 
# 
# with open('HW4_preds.txt', 'w') as f:
#     for item in Preds:
#         f.write("%s\n" % item)

In [18]:
# Run Baseline CNN
from torch.utils.data import DataLoader

#initialize params
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
neural_network = ConvolutionalNN()
model = neural_network.to(device)
learning_rate = 0.001
batch_size = 64
loss_function = nn.CrossEntropyLoss()
optimizer = torch.optim.Adagrad(model.parameters(), lr=learning_rate)
num_epochs = 40

# CrossEntropyLoss needs integer class ids, so map every screen name to an
# index (user_names[i] is the screen name behind id i).
user_names, user_ids = np.unique(users, return_inverse=True)
assert len(user_names) <= ConvolutionalNN.num_classes, "more users than network outputs"

# NOTE(review): `tweets` still holds raw strings. torch.from_numpy inside
# Dataset rejects object arrays (the TypeError recorded under this cell), and
# the Conv2d layers of ConvolutionalNN expect a 4-D float batch. The tweets
# have to be turned into numeric feature tensors, and the network's input
# shape matched to them, before this cell can run end to end.
new_x_train, new_y_train, x_val, y_val = create_validation(tweets, user_ids)

#load into dataloader/dataset
train_dataset = Dataset(new_x_train, new_y_train)
validation_dataset = Dataset(x_val, y_val)
# The samples are sorted by user, so shuffle the training batches; otherwise
# each batch would contain (almost) a single class.
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
validation_loader = DataLoader(validation_dataset, batch_size=batch_size, shuffle=False)

accuracy,loss_np,val_accuracy = train_val_NN(model, train_loader, validation_loader, loss_function, optimizer,num_epochs)
print(accuracy)
print(loss_np)
print(val_accuracy)

#plot per epoch, one labelled figure per metric
for values, title, ylabel in (
    (accuracy, 'Training accuracy per epoch', 'accuracy'),
    (loss_np, 'Training loss per epoch', 'summed batch loss'),
    (val_accuracy, 'Validation accuracy per epoch', 'accuracy'),
):
    fig, ax = plt.subplots()
    ax.plot(values, label=title)
    ax.set(title=title, xlabel='epoch', ylabel=ylabel)
    plt.show()

TypeError: can't convert np.ndarray of type numpy.object_. The only supported types are: double, float, float16, int64, int32, and uint8.