# Keyword Extraction using Machine Learning
Keyword extraction is a natural language processing task that automatically identifies a set of terms that best describe the subject of a text. This is an important method in information retrieval (IR) systems: keywords simplify and speed up research. Keyword extraction can also be used to reduce text dimensionality for further text analysis (e.g. topic modeling and text classification).
The task of keyword extraction can be used in automatically indexing data, summarizing text, or generating tag clouds with the most representative keywords.

Here is my machine learning project on keyword extraction using the Python programming language.

### Step 1: Import Necessary Libraries and Dataset

In [None]:
import numpy as np
import pandas as pd

In [None]:
df = pd.read_csv('./data/papers.csv')

In [None]:
df.head()

In [None]:
df.tail()

#### Observations: I see that the dataset contains the id, year, title and more. However, the columns such as event type and abstract seem to be missing.

### Step 2: Preprocess textual data
I will use the NLTK library in Python.

In [None]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer

In [None]:
#nltk.download()

In [None]:
# Fetch the NLTK resources this notebook relies on: the English stopword list
# used for filtering, and the WordNet corpus that WordNetLemmatizer needs
# (lemmatizing without it raises LookupError).
nltk.download('stopwords')
nltk.download('wordnet')

#### Create a list of custom stopwords

In [None]:
# Start from NLTK's English stopwords and extend them with filler terms
# (figure references, generic verbs/adjectives, spelled-out numbers) that
# should never be reported as keywords.
stop_words = set(stopwords.words('english'))
new_words = ["fig", "figure", "image", "sample", "using",
             "show", "result", "large",
             "also", "one", "two", "three",
             "four", "five", "six", "seven", "eight", "nine"]
# Stored as a list so the name keeps the same type for the following cells.
stop_words = list(stop_words.union(new_words))

#### Create a preprocessing function that cleans and normalizes the text data before analysis

In [None]:
def pre_process(text):
    """Clean and normalize raw text ahead of keyword extraction.

    The text is lowercased, stripped of markup tags, digits and punctuation,
    split on whitespace, filtered of stopwords and tokens shorter than three
    characters, lemmatized, and filtered of stopwords once more.

    Args:
        text: The raw document text as a string.

    Returns:
        The surviving lemmas joined by single spaces.
    """
    text = text.lower()

    # Remove markup tags. The first pattern handles entity-escaped tags
    # ("&lt;b&gt;"); the second handles literal tags ("<b>", "</div>").
    # The literal pattern requires a letter right after "<" so that
    # inequalities such as "a < b" are left alone.
    text = re.sub("&lt;/?.*?&gt;", " ", text)
    text = re.sub(r"</?[a-z][^<>]*>", " ", text)

    # Collapse every run of digits and non-word characters into one space.
    text = re.sub(r"(\d|\W)+", " ", text)

    # A set gives constant-time membership checks regardless of how the
    # module-level stop_words collection is stored.
    blocked = set(stop_words)

    # Stopwords carry no topical information; tokens under three letters are
    # dropped as well. Filtering before lemmatizing avoids turning stopwords
    # such as "was" into junk lemmas.
    tokens = [word for word in text.split()
              if word not in blocked and len(word) >= 3]

    # Lemmatization maps inflected forms onto their dictionary form so that
    # variants of one word are counted as a single keyword candidate.
    lemmatizer = WordNetLemmatizer()
    lemmas = [lemmatizer.lemmatize(word) for word in tokens]

    # Filter again: a plural such as "results" only becomes the stopword
    # "result" after lemmatization.
    lemmas = [word for word in lemmas if word not in blocked]

    return ' '.join(lemmas)
# Run every paper's full text through the cleaning pipeline and collect the
# cleaned strings in a plain Python list.
docs = [pre_process(paper) for paper in df['paper_text']]


In [None]:
# Build the n-gram vocabulary and the document-term count matrix.
cv = CountVectorizer(
    ngram_range=(1, 3),    # unigrams, bigrams and trigrams
    max_features=10000,    # upper bound on the vocabulary size
    max_df=0.95,           # skip terms found in more than 95% of the documents
)
word_count_vector = cv.fit_transform(docs)

In [None]:
# Sample inputs of unequal length for the merge experiment below.
word1, word2 = "abcd", "pq"

In [None]:
def mergeAlternately(word1: str, word2: str) -> str:
    """Interleave two strings character by character.

    Characters are taken alternately, starting with ``word1``. Once the
    shorter string is exhausted, the remainder of the longer one is appended
    unchanged.

    Args:
        word1: The string that contributes the first character.
        word2: The string that contributes the second character.

    Returns:
        The merged string. If either input is empty, the other input is
        returned as is.
    """
    shared = min(len(word1), len(word2))

    # zip stops at the shorter input, so this covers exactly the overlap.
    pieces = [left + right for left, right in zip(word1, word2)]

    # At most one of these tails is non-empty.
    pieces.append(word1[shared:])
    pieces.append(word2[shared:])

    return ''.join(pieces)

In [None]:
mergeAlternately(word1, word2)

In [None]:
str4 = str2*2
str4

In [None]:
str3 = str2+str2

In [None]:
str3

In [None]:
for i in range(6, 0,-2):
    print(i, end=" ")

In [None]:
a = 'AAAAAAAAAA'
b = 'AAAAA'

In [None]:
c = b[3:]

In [None]:
c

In [None]:
b[0]

In [None]:
len(b)

In [None]:
b[0]*len(a) == a

In [None]:
b[:4]*(len(b)//4)

In [None]:
b[:4]

In [None]:
b[4:8]

In [None]:
b[8:]

In [None]:
len(str1)%len(str2) == 0

In [None]:
len(str2)//2

In [None]:
len(str1)%len(str2) == 0

In [None]:
# Draft: when str2 is the shorter string, look for a prefix of str2 that
# rebuilds str2 when repeated.
ls = []
if len(str2) < len(str1):
    if str1 == str2 + str2:
        # str1 is exactly two copies of str2.
        ls = str2
    else:
        # Candidate lengths go len(str2), len(str2) - 2, ...; the loop never
        # stops early, so the last (shortest) matching prefix is kept.
        for i in range(len(str2), 0, -2):
            if str2[:i] * (len(str2) // i) == str2:
                ls = str2[:i]
print(ls)

In [None]:
a = 1
b = 2

In [None]:
a = b
a

In [None]:
c = a

In [None]:
b = c
b

In [None]:
str1 = str2

In [None]:
str1

In [None]:
str3 = str1

In [None]:
str2 = str1

In [None]:
str1

In [None]:
str1 == str2+str2

In [None]:
str1 = "NLZGMNLZGMNLZGMNLZGMNLZGMNLZGMNLZGMNLZGM"
str2 = "NLZGMNLZGMNLZGMNLZGMNLZGMNLZGMNLZGMNLZGMNLZGM"

In [None]:
str1

In [None]:
# Walk candidate unit lengths from len(str1) down to 1. Print each length
# whose str2 prefix rebuilds str2; when that prefix also rebuilds str1, print
# the length again, store the prefix in ls and print it.
for i in range(len(str1), 0, -1):
    if str2[:i] * (len(str2) // i) != str2:
        continue
    print(i)
    if str2[:i] * (len(str1) // i) != str1:
        continue
    print(i)
    ls = str2[:i]
    print(ls)

In [None]:
str2[:3]*(len(str2)//3)

In [None]:
def _gcd_of_strings(first, second):
    """Return the longest string that rebuilds both inputs when repeated.

    Args:
        first: One of the two strings to compare.
        second: The other string.

    Returns:
        The longest common repeating unit, or '' when the strings share no
        such unit or either of them is empty.
    """
    from math import gcd  # local import keeps this cell self-contained

    if not first or not second:
        return ''

    # Two strings are built from a common repeating unit exactly when
    # concatenating them in either order gives the same result.
    if first + second != second + first:
        return ''

    # The longest common unit then has length gcd(len(first), len(second)).
    return first[:gcd(len(first), len(second))]


# Computed through a helper so that str1 and str2 are not swapped or rebound,
# every candidate length is considered, and the longest unit is reported.
ls = _gcd_of_strings(str1, str2)
print(ls)