# Hamilton: An American Musical

Hamilton is a sung-and-rapped-through musical by Lin-Manuel Miranda. It tells the story of American Founding Father Alexander Hamilton.

Miranda said he was inspired to write the musical after reading the 2004 biography *Alexander Hamilton* by Ron Chernow. The show draws heavily from hip hop, as well as R&B, pop, soul, and traditional-style show tunes, and casts non-white actors as the Founding Fathers and other historical figures. Miranda described Hamilton as being about "America then, as told by America now". Source: <https://en.wikipedia.org/wiki/Hamilton_(musical)>

![](https://encrypted-tbn0.gstatic.com/images?q=tbn%3AANd9GcT2X_MMJ0VnlzU1xQvBMYEzSzxZENQ8-9sD6g&usqp=CAU)genius.com

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle input directory.
for folder, _subfolders, file_names in os.walk('/kaggle/input'):
    for file_name in file_names:
        print(os.path.join(folder, file_name))

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Code by Erdal Nayir: https://www.kaggle.com/erdal002/natural-language-processing/comments

In [None]:
import re  # regular expression library
import nltk  # Natural Language Toolkit
import nltk as nlp  # second alias for the same package; used for the WordNet lemmatizer
from nltk import word_tokenize, sent_tokenize

# Fetch the NLTK data used by this notebook.
# "punkt_tab" is the tokenizer data that word_tokenize loads on NLTK >= 3.8.2
# (it replaced the pickled "punkt" model), so without it tokenizing fails there.
# NOTE(review): "omw-1.4" is required by the WordNet lemmatizer on some NLTK
# releases — harmless elsewhere, confirm against the installed version.
# On versions that do not know a resource, nltk.download() only reports the
# failure and returns False, so the extra names are safe on older installs.
for resource in ("stopwords", "punkt", "punkt_tab", "wordnet", "omw-1.4"):
    nltk.download(resource)

In [None]:
from colorama import Fore, Style

nRowsRead = 1000  # number of rows to load; set to None to read the whole file
# Only the first nRowsRead rows of ham_lyrics.csv are loaded here
# (the file is noted to contain 3634 rows in total).
df = pd.read_csv('../input/hamilton-lyrics/ham_lyrics.csv', delimiter=',', nrows=nRowsRead)
df.dataframeName = 'ham_lyrics.csv'  # plain attribute on the frame, not a column
nRow, nCol = df.shape
print(f'There are {nRow} rows and {nCol} columns')
print(Fore.MAGENTA + 'Data shape: ', Style.RESET_ALL, df.shape)
# Preview the first rows as the cell output. Previously df.head() was called in
# the middle of the cell, where its result was discarded, and the whole frame
# was displayed at the end instead.
df.head()

In [None]:
# Count the missing values in each column (isna is the same check as isnull).
df.isna().sum()

# Data preprocessing and cleaning

In [None]:
ham_list = []  # cleaned lyric lines, one string per entry of df.lines

# Build the pattern and the lemmatizer once instead of once per line.
# The previous character class "[^a-zA-z]" used the range A-z, which also spans
# the ASCII characters [ \ ] ^ _ ` that sit between 'Z' and 'a', so those
# characters were left in the text instead of being replaced.
non_letters = re.compile("[^a-zA-Z]")
lemma = nlp.WordNetLemmatizer()

for ham in df.lines:
    ham = non_letters.sub(" ", ham)  # replace every non-letter character with a space
    ham = ham.lower()  # normalise case
    ham = nltk.word_tokenize(ham)  # split the line into word tokens
    ham = [lemma.lemmatize(word) for word in ham]  # reduce each token to its lemma
    ham = " ".join(ham)  # back to a single space-separated string
    ham_list.append(ham)

# Bag of Words

In [None]:
from sklearn.feature_extraction.text import CountVectorizer  # bag-of-words vectorizer

max_features = 500  # vocabulary size: keep only the most frequent terms

# English stop words are excluded from the vocabulary by the vectorizer itself.
count_vectorizer = CountVectorizer(stop_words="english", max_features=max_features)

# Learn the vocabulary from the cleaned lines, then convert the sparse
# document-term matrix into a dense array (one row per line, one column per term).
# NOTE(review): CountVectorizer stores term counts, so entries are not limited to 0 and 1.
document_term_counts = count_vectorizer.fit_transform(ham_list)
sparce_matrix = document_term_counts.toarray()

In [None]:
# Dimensions of the dense document-term array: (rows, columns).
sparce_matrix.shape 

The matrix is built from up to 500 of the most frequently used words (`max_features`) and the 1000 lyric lines that were loaded.

As you can see, the columns of the matrix correspond to the most common words, and the rows correspond to the lyric lines.

In [None]:
# Display the dense document-term array produced by the vectorizer.
sparce_matrix

In [None]:
# CountVectorizer.get_feature_names() was removed in scikit-learn 1.2 in favour of
# get_feature_names_out(); use the new method when present, else fall back.
vocabulary_getter = getattr(count_vectorizer, "get_feature_names_out", None) or count_vectorizer.get_feature_names
# Convert to plain Python strings so the printed list looks the same on every version.
vocabulary_terms = [str(term) for term in vocabulary_getter()]
print("Top {} the most used word in Hamilton Lyrics: {}".format(max_features, vocabulary_terms))

In [None]:
# Put the learned vocabulary into a one-column DataFrame.
# get_feature_names() was removed in scikit-learn 1.2, so prefer
# get_feature_names_out() and only fall back to the old name on older versions.
feature_name_getter = getattr(count_vectorizer, "get_feature_names_out", None) or count_vectorizer.get_feature_names
data = pd.DataFrame([str(word) for word in feature_name_getter()], columns=["Words"])

In [None]:
# Preview the first rows of the vocabulary table.
data.head()

In [None]:
from wordcloud import WordCloud 
import matplotlib.pyplot as plt
# Join the vocabulary terms into one text and build the word cloud from it.
# NOTE(review): the [100:] slice skips the first 100 terms — confirm this is intended.
cloud_text = " ".join(data.Words[100:])
wordcloud = WordCloud(background_color="white", width=1024, height=768).generate(cloud_text)

# Draw the cloud on a square figure with the axes hidden.
fig, ax = plt.subplots(figsize=(12, 12))
ax.imshow(wordcloud)
ax.axis("off")
plt.show()

In [None]:
# Number of loaded lines per speaker.
df["speaker"].value_counts()

In [None]:
# Features are the dense bag-of-words array; the target is the speaker column.
X, y = sparce_matrix, df["speaker"]

In [None]:
from sklearn.model_selection import train_test_split,GridSearchCV,cross_val_score
from sklearn.metrics import confusion_matrix,accuracy_score,classification_report,log_loss,precision_score
from sklearn.naive_bayes import GaussianNB
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
import warnings
warnings.filterwarnings("ignore")
from sklearn.metrics import roc_auc_score,roc_curve


# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size=0.2)

# Report the shape of each part of the split.
split_parts = (
    ("x_train", X_train),
    ("x_test", X_test),
    ("y_train", y_train),
    ("y_test", y_test),
)
for part_label, part in split_parts:
    print(part_label, part.shape)

# LightGBM Classifier

In [None]:
from sklearn.svm import SVC

# LightGBM classifier with its default settings, fitted on the training split.
# The fit call is the last expression, so the fitted estimator is the cell output.
lgbm_model = LGBMClassifier()
lgbm_model.fit(X=X_train, y=y_train)

In [None]:
# Predict a speaker for every held-out row with the fitted LightGBM model.
# Note: y_pred is reassigned by each later model's prediction cell.
y_pred=lgbm_model.predict(X_test)

In [None]:
# Score the current predictions against the held-out labels.
# NOTE(review): micro-averaged precision on a single-label multiclass problem
# is expected to equal accuracy, so the two numbers will likely match.
lgbm_scores = (
    ("Accuracy:", accuracy_score(y_test, y_pred)),
    ("Precision:", precision_score(y_test, y_pred, average="micro")),
)
for score_label, score_value in lgbm_scores:
    print(score_label, score_value)

# XGBoost Classifier

In [None]:
# XGBoost classifier with default settings; fit() hands back the fitted estimator.
# NOTE(review): recent XGBoost releases expect integer class labels (0..n_classes-1);
# if df.speaker holds strings this fit may need label encoding first — TODO confirm.
xgb = XGBClassifier()
xgb_model = xgb.fit(X=X_train, y=y_train)

In [None]:
# Predict on the held-out rows with the fitted XGBoost model, then score the result.
y_pred = xgb_model.predict(X_test)

xgb_scores = (
    ("Accuracy:", accuracy_score(y_test, y_pred)),
    ("Precision:", precision_score(y_test, y_pred, average="micro")),
)
for score_label, score_value in xgb_scores:
    print(score_label, score_value)

# Naive Bayes

In [None]:
from sklearn.naive_bayes import BernoulliNB

# Two Naive Bayes variants trained on the same split: Gaussian and Bernoulli.
nb, nb2 = GaussianNB(), BernoulliNB()

nb_model = nb.fit(X_train, y_train)
nb2_model = nb2.fit(X_train, y_train)

y_pred, y_pred2 = nb_model.predict(X_test), nb2_model.predict(X_test)

# Print accuracy and micro-averaged precision for each variant,
# with a row of asterisks between the two reports.
nb_reports = (
    ("Accuracy:", "Precision:", y_pred),
    ("Accuracy_NB2:", "Precision_NB2:", y_pred2),
)
for position, (accuracy_label, precision_label, predictions) in enumerate(nb_reports):
    if position:
        print("**************************************************************")
    print(accuracy_label, accuracy_score(y_test, predictions))
    print(precision_label, precision_score(y_test, predictions, average="micro"))

# Random Forest Classifier

In [None]:
from sklearn.ensemble import RandomForestClassifier


# Random forest with a fixed seed so repeated runs build the same trees.
# The fit call is the last expression, so the fitted estimator is the cell output.
rf_model = RandomForestClassifier(random_state=42)
rf_model.fit(X=X_train, y=y_train)

In [None]:
# Predict on the held-out rows with the fitted random forest, then score the result.
y_pred = rf_model.predict(X_test)

rf_scores = (
    ("Accuracy:", accuracy_score(y_test, y_pred)),
    ("Precision:", precision_score(y_test, y_pred, average="micro")),
)
for score_label, score_value in rf_scores:
    print(score_label, score_value)

In [None]:
# Code by Olga Belitskaya https://www.kaggle.com/olgabelitskaya/sequential-data/comments
from IPython.display import display,HTML

# Default colours, Google Fonts family names and font sizes for styled headings.
c1, c2 = '#2B3A67', '#42a7f5'
f1, f2 = 'Akronim', 'Smokum'
fs1, fs2 = 30, 15


def dhtml(string,fontcolor=c1,font=f1,fontsize=fs1):
    """Display `string` as an <h1> heading styled with a Google Fonts 3D-float effect.

    string    -- text placed inside the heading
    fontcolor -- CSS colour of the text
    font      -- Google Fonts family to import and apply
    fontsize  -- font size in pixels
    """
    # <style> element that imports the requested font together with the effect.
    stylesheet = (
        "<style>\n"
        "    @import 'https://fonts.googleapis.com/css?family="
        f"{font}&effect=3d-float';</style>\n"
    )
    # Heading element; only the final piece is %-formatted with the text.
    heading = (
        "    <h1 class='font-effect-3d-float' style='font-family:"
        f"{font}; color:{fontcolor}; font-size:{fontsize}px;'>"
        + "%s</h1>" % string
    )
    display(HTML(stylesheet + heading))


dhtml('Programming is more than an important practical art. It is also a gigantic undertaking in the foundations of knowledge, Grace Hopper quote' )