# Sentence Embeddings

Run this notebook to generate the sentence embeddings for the script data. The resulting file is too large to store on GitHub, so generate it locally here.

In [2]:
from sentence_transformers import SentenceTransformer
import pandas as pd
import re
import numpy as np

In [3]:
# Load the complete script data from the hosted CSV.
complete = pd.read_csv(
    'https://scmcqueen.github.io/StarTrekScriptData/complete_data.csv'
)

# Pull the 'quote' column out as a plain Python list of sentences.
sentences = complete['quote'].tolist()

# Show the first five sentences as a quick sanity check.
sentences[:5]

["   You know, Morn -- there's nothing    quite as invigorating as breakfast    in a bar. Where else can you get    raw slug livers first thing in the    morning?",
 "   What's this?",
 '   What do you mean, "what\'s this?"    It\'s puree of beetle.',
 "   I didn't order it.",
 '   Of course you "didn\'t order it" --    you don\'t need to order it. You    have it after work every morning.']

In [4]:
# Load the pretrained all-distilroberta-v1 sentence-embedding model.
model = SentenceTransformer(
    model_name_or_path='sentence-transformers/all-distilroberta-v1'
)

In [5]:
# Encode every sentence into an embedding vector with the loaded model.
# NOTE: slow — the original author measured roughly 4 minutes for the full dataset.
embeddings = model.encode(sentences)

In [None]:
# Wrap the encoder output in a DataFrame (one column per embedding dimension).
embedding_df = pd.DataFrame(data=embeddings)

# Optional: uncomment to persist the embeddings to a local CSV.
# embedding_df.to_csv('st_embeddings.csv')

# Peek at the first rows.
embedding_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,758,759,760,761,762,763,764,765,766,767
0,-0.0149,-0.071977,-0.009478,-0.029031,0.014188,0.016394,0.027681,-0.037291,-0.006358,-0.004826,...,0.00111,0.012414,0.030341,-0.003418,-0.008822,0.033325,-0.001755,0.06869,0.02025,0.039753
1,-0.007164,-0.013978,-0.013652,0.003309,0.034851,0.060882,0.048588,0.017505,0.007778,-0.010418,...,0.022243,0.049582,-0.00842,-0.030691,-0.000645,-0.05014,0.083585,-0.046941,-0.02003,0.012742
2,-0.049198,-0.035306,0.008143,0.088346,0.093932,0.025349,-0.01051,-0.015539,-0.002362,-0.013975,...,0.003785,-0.010067,0.007732,-0.049809,0.05156,0.024296,0.035107,-0.066092,-0.020333,-0.009799
3,-0.04359,-0.021896,-0.019405,0.027361,-0.020507,0.03543,-0.040713,0.043625,0.065613,0.002796,...,0.009547,0.010854,-0.027765,0.049275,0.023973,0.004445,0.07236,-0.029266,-0.031327,0.004318
4,-0.053526,0.001219,0.001101,0.03138,-0.008443,0.052395,-0.043175,-0.038514,-0.012206,0.025206,...,-0.040711,0.010254,0.007623,0.042096,-0.003285,0.00207,0.003425,-0.044013,0.021388,0.001928


### Test LambdaRank with Embeddings only!

In [15]:
import lightgbm as lgb
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report
from sklearn.datasets import load_breast_cancer
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import spacy
from spacy.tokens import Doc
import csv
import altair as alt

# Lift Altair's cap on the number of rows a chart's dataset may contain.
alt.data_transformers.disable_max_rows()

DataTransformerRegistry.enable('default')

In [22]:
# Read the ranking data; the first CSV column becomes the row index.
ratings = pd.read_csv('skyeler_ranking_data.csv', index_col=0)

# Keep only the three columns used downstream and give them fixed names.
# The rename is positional, so it relies on the remaining column order.
ratings = ratings.drop(columns=['Unnamed: 0', 'quote'])
ratings.columns = ['index', 'query', 'ranking']

# Replace negative rankings (presumably "unranked" markers — confirm) with one
# more than the largest ranking. The maximum is computed once here rather than
# once per row inside a lambda.
# NOTE(review): this gives those rows the LARGEST label, and LightGBM's
# lambdarank objective treats larger labels as more relevant — confirm that
# this ordering is what the model should learn.
sentinel_rank = ratings['ranking'].max() + 1
ratings['ranking'] = ratings['ranking'].mask(ratings['ranking'] < 0, sentinel_rank)

ratings.head()

Unnamed: 0,index,query,ranking
0,113689,lwaxana,21
1,113558,lwaxana,21
2,113532,lwaxana,21
3,55180,lwaxana,21
4,55060,lwaxana,21


In [None]:
# Expose each embedding's row position as an 'index' column so it can be joined
# to the ratings. Guarded so that re-running this cell does not call
# reset_index() a second time, which would add a 'level_0' column that is not
# dropped later and would therefore end up in the feature matrix.
if 'index' not in embedding_df.columns:
    embedding_df = embedding_df.reset_index()

In [None]:
# Left-join the embedding vectors onto the ratings via the shared 'index' key,
# keeping every rated row.
ratings_embeddings = pd.merge(
    ratings,
    embedding_df,
    how='left',
    on='index',
)
ratings_embeddings.head()

Unnamed: 0,index,query,ranking,0,1,2,3,4,5,6,...,758,759,760,761,762,763,764,765,766,767
0,113689,lwaxana,21,0.018398,0.012873,-0.024242,0.002963,0.049414,0.09395,-0.004741,...,0.00988,-0.042633,-0.056167,0.008068,0.013,-0.046821,0.00999,0.005621,0.028357,0.019484
1,113558,lwaxana,21,0.016721,0.024398,-0.030568,-0.013274,0.042874,0.059577,0.034213,...,0.011101,-0.056366,-0.052283,0.000956,0.000591,-0.030117,-0.00134,4.5e-05,0.005712,0.018245
2,113532,lwaxana,21,0.016721,0.024398,-0.030568,-0.013274,0.042874,0.059577,0.034213,...,0.011101,-0.056366,-0.052283,0.000956,0.000591,-0.030117,-0.00134,4.5e-05,0.005711,0.018245
3,55180,lwaxana,21,0.018398,0.012873,-0.024242,0.002963,0.049414,0.09395,-0.004741,...,0.00988,-0.042633,-0.056167,0.008068,0.013,-0.046821,0.00999,0.005621,0.028357,0.019484
4,55060,lwaxana,21,0.018398,0.012873,-0.024242,0.002963,0.049414,0.09395,-0.004741,...,0.00988,-0.042633,-0.056167,0.008068,0.013,-0.046821,0.00999,0.005621,0.028357,0.019484


In [26]:
# Hold out 15% of the rows for validation (fixed random_state), then order each
# partition by query so that rows belonging to the same query are contiguous.
# NOTE(review): the split is row-wise, so one query's rows can land in both
# partitions — confirm that is acceptable for this ranking evaluation.
train, test = (
    part.sort_values(by='query')
    for part in train_test_split(ratings_embeddings, test_size=0.15, random_state=56)
)

In [31]:
# Labels, in the same row order as the feature matrix below.
train_y = train['ranking'].tolist()

# Number of rows per query, in the order the queries appear in `train`
# (which is already sorted by query, so each query's rows are contiguous).
# groupby(sort=False) follows the frame's row order directly and does not rely
# on the 'count' column name that value_counts().reset_index() only produces
# on pandas >= 2.0.
train_groups = train.groupby('query', sort=False).size().to_numpy()

# Features: every column except the join key, the query text and the label.
train_X = train.drop(columns=['index', 'query', 'ranking'])

In [36]:
# Labels, in the same row order as the feature matrix below.
test_y = test['ranking'].tolist()

# Number of rows per query, in the order the queries appear in `test`
# (which is already sorted by query, so each query's rows are contiguous).
# groupby(sort=False) follows the frame's row order directly and does not rely
# on the 'count' column name that value_counts().reset_index() only produces
# on pandas >= 2.0.
test_groups = test.groupby('query', sort=False).size().to_numpy()

# Features: every column except the join key, the query text and the label.
test_X = test.drop(columns=['index', 'query', 'ranking'])

In [35]:
# train_X needs no further preparation because the embedding vectors are already numeric features.

In [38]:
# Wrap both splits in LightGBM datasets; `group` carries the per-query row counts.
train_data = lgb.Dataset(
    data=train_X,
    label=train_y,
    group=train_groups,
    params={'min_data_in_leaf': 1},
)
valid_data = lgb.Dataset(data=test_X, label=test_y, group=test_groups)

In [39]:
# Hyper-parameters for the LightGBM LambdaRank model.
# The original literal listed 'num_leaves' twice (19, then 255). Python keeps
# the last value for a repeated key, so 255 was always the effective setting;
# only that entry is kept here.
params = {
    'objective': 'lambdarank',
    'metric': 'ndcg',
    'learning_rate': 0.01,
    'num_leaves': 255,
    'task': 'train',
    'feature_pre_filter': False,
    'max_depth': 40,
    'verbose': -1,
}
# Empty dict; not used by any cell visible in this part of the notebook.
res = {}

In [40]:
# Fit the ranker for 250 boosting rounds, evaluating on the validation dataset.
ranker = lgb.train(
    params=params,
    train_set=train_data,
    num_boost_round=250,
    valid_sets=[valid_data],
)

In [41]:
# Validation-set scores recorded during training (NDCG at several cutoffs).
ranker.best_score

defaultdict(collections.OrderedDict,
            {'valid_0': OrderedDict([('ndcg@1', 0.8278325829572262),
                          ('ndcg@2', 0.8748111947107955),
                          ('ndcg@3', 0.9000711937286457),
                          ('ndcg@4', 0.9101365714504701),
                          ('ndcg@5', 0.9225574551310549)])})