# Finding Keywords

## RAKE Keyword Extraction: Quick Test

In [18]:
import nltk
# Fetch the NLTK resources used by the keyword-extraction cells: the
# stop-word corpus and the "punkt" tokenizer data (presumably what rake_nltk
# relies on for stop-word filtering and sentence splitting -- verify).
nltk.download('stopwords')
# This call is deliberately the last expression in the cell, so its boolean
# return value is what the notebook displays as the cell output.
# NOTE(review): recent NLTK releases may look for 'punkt_tab' instead of
# 'punkt' -- confirm against the installed NLTK version.
nltk.download('punkt')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ksrey\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\ksrey\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [19]:
from rake_nltk import Rake

# Short sample sentence used to sanity-check the extractor before running it
# on the real data set.
test_input = (
    "RAKE (Rapid Automatic Keyword Extraction) is a python library "
    "that can be used to extract the main keywords in a text."
)

# Run RAKE with its default settings over the sample sentence.
r = Rake()
r.extract_keywords_from_text(test_input)

# Phrases in the ranked order returned by RAKE.
keywords_list = r.get_ranked_phrases()

# Show one phrase per line.
keywords_str = "\n".join(keywords_list)
print(keywords_str)

rapid automatic keyword extraction
python library
main keywords
used
text
rake
extract


## Applying Keyword Extraction to the DataFrame

In [14]:
# Apply keyword extraction to every review in the DataFrame

from rake_nltk import Rake
import pandas as pd
from collections import Counter
import numpy as np

# Load the TripAdvisor review data set.
# The path is a raw string (r'...') so that Windows backslashes are taken
# literally; in a normal string, sequences such as "\P" or "\V" are invalid
# escapes that newer Python versions warn about.
# NOTE(review): this is an absolute, machine-specific path -- adjust it (or
# move it to a configurable data directory) when running elsewhere.
data = pd.read_csv(r'C:\Program Files (x86)\Visual Studio Code\Python\Masters_Project\Data Sets\Trip-Advisor-rating-sentiments.csv')

def extract_keywords(text):
    """Return the RAKE keyword phrases found in ``text``.

    Parameters
    ----------
    text : str
        The review text to analyse.

    Returns
    -------
    list of str
        The phrases in the ranked order produced by
        ``Rake.get_ranked_phrases()``. An empty list is returned when
        ``text`` is not a string (e.g. ``None`` or the ``NaN`` pandas uses
        for a missing cell) or contains only whitespace.
    """
    # Missing reviews arrive as NaN/None when this function is applied to a
    # DataFrame column; handing those to the tokenizer would abort the whole
    # apply, so treat them as "no keywords" instead.
    if not isinstance(text, str) or not text.strip():
        return []

    rake = Rake()
    rake.extract_keywords_from_text(text)
    return rake.get_ranked_phrases()

# Extract keyword phrases for every review; each cell of the new "keywords"
# column holds the list returned by extract_keywords for that row.
data["keywords"] = data["Review"].apply(extract_keywords)

# Flatten the per-review lists into a single list of phrases.
words = [phrase for phrases in data["keywords"] for phrase in phrases]

# Remove duplicates while keeping first-seen order. dict.fromkeys is used
# rather than set() because the iteration order of a set of strings can
# differ between interpreter runs, which would make this list
# non-reproducible.
words = list(dict.fromkeys(words))

## Using Python Counter

In [15]:
# Gather every keyword phrase from every review. Duplicates are kept on
# purpose: the counter below needs them to compute frequencies.
words = []
for review_keywords in data['keywords']:
    words.extend(review_keywords)

# Tally how often each keyword phrase occurs across all reviews.
word_counter = Counter(words)

# One row per distinct keyword. The frame is built directly from the
# (keyword, count) pairs so that "count" stays an integer column; passing the
# pairs through np.array would coerce the counts to strings.
word_df = pd.DataFrame(list(word_counter.items()), columns=["keyword", "count"])

## Top 10 Keywords

In [None]:
# Make sure the counts are integers so the sort below is numeric rather than
# lexicographic (a no-op when the column is already an integer dtype).
word_df["count"] = word_df["count"].astype(int)

# Order keywords from most to least frequent.
word_df = word_df.sort_values(["count"], ascending=False)

# Keep the 20 most frequent keywords. head(20) returns exactly 20 rows,
# whereas a [0:19] slice stops one row short.
top_20 = word_df.head(20)

# Display the 10 most frequent keywords as the cell output.
word_df.head(10)

## Bar plot of Top 20 Keywords

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Apply seaborn's default theme with a wide figure size, then create the
# figure and the axes that the bars are drawn on.
sns.set(rc={"figure.figsize": (15, 8)})
fig, ax = plt.subplots()

# Switch the active colour palette to nine "husl" colours.
sns.set_palette("husl", 9)

# Draw one bar per keyword on the axes created above.
ax = sns.barplot(x="keyword", y="count", data=top_20, ax=ax)

# Title, transparent axes background, and slanted x tick labels.
ax.set_title("Top 20 Keywords", fontsize=20)
ax.patch.set_visible(False)
ax.tick_params(axis="x", labelrotation=45)

# Render the figure.
plt.show()