I have done a basic analysis of the tweets, covering the most frequent words, the most common locations and the subjects of the tweets. Visualizations are shown below. Suggestions are welcome.

In [None]:
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

In [None]:
# Load the tweets dataset, then show its first rows followed by a blank
# line and the summary statistics of its columns.

isis = pd.read_csv('../input/tweets.csv')
print(isis.head(), '', isis.describe(), sep='\n')

In [None]:
# Step 1. Basic Analysis

# 1.1 most frequent words used in tweets
## I will preprocess all the tweets: convert them to lowercase, remove stopwords such as "the" and "in", and stem the words. I will also try to separate hashtags into individual words wherever possible, e.g.
## #AmazingDay ---> amazing day

from functools import lru_cache


@lru_cache(maxsize=None)
def _english_stopwords():
    """Return NLTK's English stopwords as a frozenset, loaded only once."""
    return frozenset(stopwords.words('english'))


def _stem_plain_words(plain_words):
    """Return the set of stemmed, non-stopword forms of lowercase words."""
    if not plain_words:
        return set()
    stop = _english_stopwords()
    stemmer = PorterStemmer()
    return {stemmer.stem(word) for word in plain_words if word not in stop}


def _split_hashtag(hashtag):
    """Split a hashtag into lowercase words on CamelCase boundaries.

    '#AmazingDay' gives ['amazing', 'day']. A run of capitals such as
    '#ISIS' is kept together as one word instead of being split into
    single letters. A bare '#' gives an empty list.
    """
    tag = hashtag.replace('#', '')
    parts = re.findall(r'[A-Z]+(?![a-z])|[A-Z]?[a-z]+', tag)
    return [part.lower() for part in parts]


def preprocess(tweet):
    """Turn a raw tweet into a list of unique, normalized words.

    Steps:
      1. Remove the 'ENGLISH TRANSLATION:' / 'ENGLISH TRANSLATIONS:' marker.
      2. Keep only ASCII letters, '#' and spaces.
      3. Lowercase the plain words, drop English stopwords and stem the
         rest with the Porter stemmer.
      4. Split hashtags into their component words (not stemmed).

    Parameters:
        tweet: the tweet text. Anything that is not a string (for example
            a missing value) yields an empty list.

    Returns:
        A list of unique words in no particular order. It never contains
        the empty string.
    """
    if not isinstance(tweet, str):
        return []

    tweet = re.sub(r'ENGLISH TRANSLATIONS?:', '', tweet)
    # Newlines and tabs separate words too; turn them into spaces before the
    # character filter below, otherwise neighbouring words get glued together.
    tweet = re.sub(r'\s+', ' ', tweet)
    # Strip the tweet of non-alphabetic characters except '#'
    tweet = re.sub(r'[^A-Za-z# ]', '', tweet)

    tokens = tweet.split()
    hashtags = [token for token in tokens if token.startswith('#')]
    plain_words = [token.lower() for token in tokens
                   if not token.startswith('#')]

    words = _stem_plain_words(plain_words)
    for hashtag in hashtags:
        words.update(_split_hashtag(hashtag))
    return list(words)

# Attach every tweet's processed word list to the dataframe as a new
# "wordlist" column, by running preprocess over the 'tweets' column.

isis['wordlist'] = list(map(preprocess, isis['tweets']))

In [None]:
# Frequency of the various words used in the tweets

from collections import Counter

# Flatten the per-tweet word lists into one list of word occurrences
all_words = [word for wordlist in isis['wordlist'] for word in wordlist]
length_all = len(all_words)
# Counter tallies all words in a single pass over the list; calling
# all_words.count(word) for each unique word rescans the whole list each time.
wordcount = dict(Counter(all_words))
print(length_all)


In [None]:
import operator
wordcount = sorted(wordcount.items(), key = operator.itemgetter(1))
wordcount.reverse()

%matplotlib inline

import matplotlib.pyplot as plt

#plotting the top 20 most frequent words

wordcount = wordcount[2:] #since first two words are '' and 'rt'
top20 = wordcount[:20]
top20_words = [word for (word,count) in top20]
top20_freq = [count for (word,count) in top20]
indexes = np.arange(len(top20_words))
width = 0.7
plt.figure(figsize=(15,15))
plt.bar(indexes, top20_freq, width)
plt.xticks(indexes + width/2 , top20_words)
plt.show()

In [None]:
# location analysis
unique_locations = isis['location'].unique()
# value_counts tallies the column in one pass and returns the counts sorted
# from most to least frequent; dropna=False keeps the missing-location
# bucket, which unique() reports as well.
location_counts = isis['location'].value_counts(dropna=False)
unique_counts = [(loc, int(count))
                 for loc, count in zip(location_counts.index, location_counts.values)]
for (loc, counts) in unique_counts:
    print(loc, counts)

In [None]:
# subject of tweet analysis using pos tagging

def tweet_subject(tweet):
    """Return the nouns found in a tweet, in order of appearance.

    The 'ENGLISH TRANSLATION:' and 'ENGLISH TRANSLATIONS:' markers are
    removed, the remaining text is lowercased, tokenized and POS-tagged
    with NLTK, and every token whose tag starts with 'NN' is returned.
    """
    for marker in ('ENGLISH TRANSLATION:', 'ENGLISH TRANSLATIONS:'):
        tweet = tweet.replace(marker, '')
    tagged_tokens = nltk.pos_tag(nltk.word_tokenize(tweet.lower()))
    return [token for token, pos in tagged_tokens if pos.startswith('NN')]


In [None]:
from collections import Counter

isis['tweet_subjects'] = [tweet_subject(tweet) for tweet in isis['tweets']]
#most frequent subjects
all_subjects = [word for wordlist in isis['tweet_subjects'] for word in wordlist]
# Counter tallies every subject in one pass, and most_common() returns the
# (subject, count) pairs already sorted from most to least frequent.
all_subjects_counts = Counter(all_subjects).most_common()
print('TOTAL UNIQUE SUBJECTS : ', len(all_subjects_counts))
for (a,b) in all_subjects_counts[:30]:
    print(a,b)

In [None]:
#plotting the top 20 most frequent subjects

top20_sub = all_subjects_counts[:20]
top20_words = [word for (word,count) in top20_sub]
top20_freq = [count for (word,count) in top20_sub]
indexes = np.arange(len(top20_words))
width = 0.7
plt.figure(figsize=(20,20))
# Bars are centered on their x position, so the tick labels go at the same
# positions to sit under the middle of each bar.
plt.bar(indexes, top20_freq, width, align='center')
plt.xticks(indexes, top20_words)
plt.show()