# CIS6930 Week 5: NLP Basics & Word Embeddings (Student version)

---

This notebook does not use a GPU. :)



## spaCy Quick Tutorial

In [None]:
!pip install spacy

In [None]:
# Programmatic equivalent of the shell command
#   python -m spacy download en_core_web_sm
import spacy.cli
spacy.cli.download("en_core_web_sm") # another option: en_core_web_lg

In [None]:
import spacy

# Load the English pipeline downloaded above and annotate one example sentence.
nlp = spacy.load("en_core_web_sm")
doc = nlp("Apple is looking at buying U.K. startup for $1 billion")

# One printed list per token: text, lemma, part-of-speech tag, dependency
# label, entity IOB code and entity type.
for token in doc:
    print([
        getattr(token, attr)
        for attr in ("text", "lemma_", "pos_", "dep_", "ent_iob_", "ent_type_")
    ])

In [None]:
# `token` is the loop variable left over from the previous cell, i.e. the
# last token of `doc`; display its lemma.
token.lemma_

In [None]:
# Named entity recognition (NER): print each entity found in `doc` with its
# text, start/end character offsets and label.
for ent in doc.ents:
    print([getattr(ent, attr) for attr in ("text", "start_char", "end_char", "label_")])

In [None]:
# Vector representation of the whole document.
# NOTE(review): en_core_web_sm presumably ships without static word vectors,
# so this is likely derived from the pipeline's internal tensors - confirm,
# and consider en_core_web_lg (mentioned above) for real word embeddings.
doc.vector

In [None]:
# Vector of `token`, the loop variable left over from iterating `doc` above
# (i.e. the last token of the sentence).
token.vector

### Why not NLTK? NLTK's Known Pitfall

In [None]:
import nltk

# `nltk.word_tokenize` relies on the Punkt sentence-tokenizer data. Newer NLTK
# releases look up the "punkt_tab" resource instead of the pickled "punkt" one,
# so fetch both to support old and new versions alike.
# (Pass "all" instead to download every NLTK resource.)
nltk.download("punkt")
nltk.download("punkt_tab")

In [None]:
# Baseline: tokenize a single, well-formed sentence that contains
# apostrophes ("o'clock", "didn't").
sentence = "At eight o'clock on Thursday morning, Arthur didn't feel very good."
nltk.word_tokenize(sentence)

In [None]:
# Pitfall demo: two sentences with no space after the period that separates
# them. NOTE(review): presumably "morning.Arthur" is not split into separate
# tokens - check the output.
nltk.word_tokenize("It was eight o'clock on Thursday morning.Arthur didn't feel very good.")
#                                                          ^^^ No white space

## Pandas 101



In [None]:
# https://www.kaggle.com/crowdflower/twitter-airline-sentiment
# License CC BY-NC-SA 4.0
!gdown --id 1BS_TIqm7crkBRr8p6REZrMv4Uk9_-e6W

In [None]:
# List the working directory to check that the download produced Tweets.csv
# (the file read in the next cell).
!ls

In [None]:
import pandas as pd

# Read the downloaded CSV into a DataFrame and preview its first rows.
df = pd.read_csv("Tweets.csv")
df.head()

In [None]:
# Column names available in the tweets DataFrame.
df.columns

In [None]:
# Passing a list of names selects several columns at once (result is a DataFrame).
df[["airline_sentiment", "airline"]].head()

### Quick Analysis with Pandas DataFrame

In [None]:
# Tweet counts per (sentiment, airline) pair, pivoted so that each airline is
# a row and each sentiment is a column.
agg_df = (
    df.groupby(["airline_sentiment", "airline"])
    .size()
    .unstack(level="airline_sentiment")
)
agg_df

In [None]:
# Row-wise sum: total number of tweets per airline.
agg_df.sum(axis=1)

In [None]:
# Divide each row by its own total so the counts become per-airline proportions.
agg_df.div(agg_df.sum(axis=1), axis=0)

In [None]:
# Bar chart of the per-airline sentiment proportions (rows normalised to sum to 1).
agg_df.div(agg_df.sum(axis=1), axis=0).plot.bar()

### Data preparation for Text classification

In [None]:
# Keep only the label column and the raw tweet text for the classification task.
subdf = df[["airline_sentiment", "text"]]
subdf.head()

In [None]:
# Show the full text of the second tweet (position 1).
subdf["text"].iloc[1]

In [None]:
# Number of tweets per sentiment label, to check the class balance.
subdf["airline_sentiment"].value_counts()

In [None]:
# Label encoding & Tokenization

In [None]:
# Classical (non-neural) baseline: bag-of-words features + integer labels
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# Raw tweet strings -> document-term count matrix X, with English stop words
# removed. Note that the vocabulary is learned from every tweet, i.e. before
# the train/test split performed later.
vectorizer = CountVectorizer(stop_words="english")
X = vectorizer.fit_transform(list(subdf["text"]))

# Sentiment strings -> integer class ids.
le = LabelEncoder()
y = le.fit_transform(subdf["airline_sentiment"].to_numpy())
y

In [None]:
# One encoded label per tweet.
y.shape

In [None]:
# Check the vocabulary
len(vectorizer.vocabulary_)

In [None]:
# Hold out 20% of the tweets for evaluation; the fixed seed keeps the split
# reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                    test_size=0.20,
                                                    random_state=42)

# Optional: dimensionality reduction using SVD: V -> Z (e.g., 500)
#from sklearn.decomposition import TruncatedSVD
#svd = TruncatedSVD(n_components=500)
#X_train = svd.fit_transform(X_train)
#X_test = svd.transform(X_test)

# The default max_iter=100 is often too few solver iterations for a
# high-dimensional bag-of-words matrix and ends with a ConvergenceWarning;
# give the optimizer more room so the reported scores come from a converged fit.
clf = LogisticRegression(max_iter=1000)
clf.fit(X_train, y_train)

print("Train acc: {:.4f}".format(clf.score(X_train, y_train)))
print(" Test acc: {:.4f}".format(clf.score(X_test, y_test)))

### Creating a confusion matrix using Pandas

In [None]:
# Pair each test-set prediction with its ground-truth label (both are the
# integer codes produced by the label encoder).
pred_df = pd.DataFrame({"pred": clf.predict(X_test),
                        "true": y_test})

In [None]:
# Confusion matrix: rows are true labels, columns are predicted labels.
# fill_value=0 makes a (true, pred) pair that never occurs show up as a 0
# count instead of NaN (which would also turn the counts into floats).
pred_df.groupby(["pred", "true"]).size().unstack(0, fill_value=0)

In [None]:
# Label names in encoded order: classes_[i] is the sentiment encoded as
# integer i, which maps the codes in the confusion matrix back to names.
le.classes_

## spaCy + Pandas: Clustering Tweets by Word Embeddings

In [None]:
import numpy as np
import seaborn as sns
from sklearn.manifold import TSNE

# Seed t-SNE as well as the sampling below; otherwise the 2-D layout (and
# therefore both scatter plots) changes on every run.
tsne = TSNE(n_components=2, random_state=1)

sample_df = df.sample(n=2000, random_state=1) # Try increasing the number of samples and run the same analysis

# One spaCy document vector per tweet. `nlp.pipe` processes the texts in
# batches, which is faster than calling `nlp(text)` once per row.
embs = np.array([tweet_doc.vector for tweet_doc in nlp.pipe(sample_df["text"])])

# Project the tweet vectors down to two dimensions for plotting.
embs2d = tsne.fit_transform(embs)

# 2-D coordinates alongside the metadata used to colour the plots.
emb_df = pd.DataFrame({"x1": embs2d[:, 0],
                       "x2": embs2d[:, 1],
                       "sentiment": sample_df["airline_sentiment"],
                       "airline": sample_df["airline"]})

emb_df.head()

In [None]:
# t-SNE projection of the sampled tweets, coloured by sentiment label.
sns.scatterplot(data=emb_df, x="x1", y="x2", hue="sentiment", s=3)

In [None]:
# Same t-SNE projection, this time coloured by airline.
sns.scatterplot(data=emb_df, x="x1", y="x2", hue="airline", s=3)