In [1]:
import nltk
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/umasreeram/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/umasreeram/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [2]:
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))
from nltk.stem.lancaster import LancasterStemmer
from nltk.stem.wordnet import WordNetLemmatizer




def stem_words(words):
    """Stem words in list of tokenized words"""
    stemmer = LancasterStemmer()
    stems = []
    for word in words:
        stem = stemmer.stem(word)
        stems.append(stem)
    return stems

def lemmatize_verbs(words):
    """Lemmatize verbs in list of tokenized words"""
    lemmatizer = WordNetLemmatizer()
    lemmas = []
    for word in words:
        lemma = lemmatizer.lemmatize(word, pos='v')
        lemmas.append(lemma)
    return lemmas

def stem_and_lemmatize(words):
    stems = stem_words(words)
    lemmas = lemmatize_verbs(stems)
    return " ".join(lemmas)




def clean_string(mystr):
    mystr=mystr.lower()
    mystr=re.sub(r"\\\w+", " ", mystr)
    mystr=re.sub(r"\@\w+"," ",mystr)
    mystr=re.sub(r"\#\w+"," ",mystr)
    mystr=re.sub(r"https?:\/\/(www\.)?[-a-zA-Z0-9@:%._\+~#=]{1,256}\.[a-zA-Z0-9()]{1,6}\b([-a-zA-Z0-9()@:%_\+.~#?&//=]*)"," ",mystr)
    
    mystr=mystr[2:-1]
    
    cleantext = "".join([x.lower() if (x.isalpha() or x.isspace()) else ' ' for x in mystr])
    
    return cleantext

def remove_stop_words(mystr):
    word_list= mystr.split()
    cleaned_word_list=[w for w in word_list if w not in stop_words and len(w)>=2]
    
    return cleaned_word_list
   
    

def join_words(mylist):
    return " ".join(mylist)


In [4]:

import os
goal_dir = os.path.join(os.getcwd(), "tweets_raw/")



for filename in os.listdir(goal_dir):
    if filename.endswith(".csv"): 
         print(filename)

tesla_tweets.csv
parenting_tweets.csv
ebay_tweets.csv
usedgov_tweets.csv
nytimes_tweets.csv
premierleague_tweets.csv
ladygaga_tweets.csv
MTV_tweets.csv
facebook_tweets.csv
FoodandTravelEd_tweets.csv


In [5]:
from numpy import asarray
from numpy import savetxt
import numpy as np
        
goal_dir = os.path.join(os.getcwd(), "tweets_raw/")

import os

vocabulary={}

for filename in os.listdir(goal_dir):
    if filename.endswith(".csv"): 
        file = pd.read_csv("tweets_raw/"+filename)
        file['clean_text']=file['text'].apply(clean_string)

        file['word list']=file['clean_text'].apply(remove_stop_words)
        #file['cleaned_word_list']=file['word list'].apply(stem_and_lemmatize)
        
        file['word list']=file['word list'].apply(join_words)
        

        file.to_csv(r'cleaned_csv/clean_'+filename, encoding='utf-8')




        vectorizer = TfidfVectorizer()
        tfidfmatrix = vectorizer.fit_transform(file['word list'])

        vocab = vectorizer.get_feature_names()
        tfidf_data=tfidfmatrix.toarray()
        vocabulary[filename]=vocab
        
        tfidf_pd=pd.DataFrame(data=tfidf_data,columns=vocab,index=file['id'])
        
        tfidf_pd.to_csv(r'tfidf_matrices/tfidf_'+filename, encoding='utf-8')
        



In [6]:
#print(vocabulary)
vocab_length={}
for x in vocabulary:
    vocab_length[x]=len(vocabulary[x])

In [7]:
print(vocab_length)

{'tesla_tweets.csv': 4597, 'parenting_tweets.csv': 2002, 'ebay_tweets.csv': 5505, 'usedgov_tweets.csv': 5356, 'nytimes_tweets.csv': 10313, 'premierleague_tweets.csv': 3643, 'ladygaga_tweets.csv': 5013, 'MTV_tweets.csv': 3966, 'facebook_tweets.csv': 3700, 'FoodandTravelEd_tweets.csv': 6607}
