In [1]:
import numpy as np
import cvxpy as cp
from sklearn.feature_extraction.text import CountVectorizer

# Tweet preprocessing

Tweets obtained using the code from [this repository](https://github.com/rnithin1/download-tweets/blob/master/why-does-twitter-make-me-do-this.py)

In [2]:
import pandas as pd
import string
import re

In [3]:
df = pd.read_csv("realdonaldtrump.csv")['0']

In [4]:
df = df.apply(lambda x: re.sub(r'http\S+', '', x))                                               # Get rid of URLs
df = df.apply(lambda x: re.sub(r'RT', '', x))                                                      # Get rid of RT
df = df.apply(lambda x: ' '.join(filter(lambda y: y[0] != '@', x.split())))             # Get rid of @'s
df = df.apply(lambda x: x.translate(str.maketrans('', '', string.digits)))             # Get rid of numbers
df = df.apply(lambda x: x.encode('ascii', 'ignore').decode('ascii'))                  # Get rid of emojis
df = df.apply(lambda x: x.strip())                                                                     # Get rid of excess whitespace
df = df.apply(lambda x: x.translate(str.maketrans('', '', string.punctuation)))   # Get rid of punctuation
df.replace('', np.nan, inplace=True)                                                                 # Drop empty strings
df.dropna(inplace=True)
df = df.reset_index()['0']

In [5]:
corpus = df.values
vectorizer = CountVectorizer(stop_words='english')
X = vectorizer.fit_transform(corpus)

# Second-Order Cone Program

Now that we have our processed tweets in the form of an array, we'll try to solve the following convex optimization problem using CVXPY (which can be posed as an SOCP):

$\text{arg min}_W \Vert X - XW^\top \Vert_F : \sum_{i=1}^{n} \Vert w_i \Vert_2 \leq \kappa$

Where $\kappa$ is a hyperparameter close to zero, to encourage many of W's columns to be zero.

In [6]:
n, m = X.toarray().shape
print((n, m))

(965, 3011)


In [7]:
kappa = cp.Variable()
kappas = [cp.Variable() for _ in range(m)]
W = cp.Variable((m, m))

In [8]:
soc_constraints = []
soc_constraints.append(kappa == 1)
soc_constraints.append(sum(kappas) <= kappa)
soc_constraints.extend([cp.SOC(kappas[i], W.T[i]) for i in range(m)])

In [12]:
prob = cp.Problem(cp.Minimize(cp.norm(X - X @ W.T, "fro")), soc_constraints)

In [13]:
prob.solve()

TypeError: Array of type 'double' required.  Array of type 'long long' given