## Importing modules

In [None]:
import re

import nltk
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer
from nltk.stem.snowball import SnowballStemmer
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

## Downloading modules

In [None]:
#Fetch the NLTK 'stopwords' package; stopwords.words('english') is used in the sections below
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [None]:
#Fetch the NLTK 'punkt' package; presumably the tokenizer data behind word_tokenize() - confirm for the installed NLTK version
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [None]:
#Fetch the NLTK 'wordnet' package; presumably the corpus behind WordNetLemmatizer in the Lemmatization section - confirm
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

## Removing stopwords

In [None]:
#An example text message used throughout the notebook
text = 'My name is shruthi, working at Tata Consultancy Services, from 3 years.'

In [None]:
#Convert the text into lower case for uniformity
#(stored under a new name so the original `text` stays available for the Tokenisation section)
text1 = text.lower()
print(text1)

my name is shruthi, working at tata consultancy services, from 3 years.


In [None]:
#Split the text on whitespace
#With split(), punctuation is not separated: it stays attached to the neighbouring word (e.g. 'shruthi,')
text1 = text1.split()
print(text1)

['my', 'name', 'is', 'shruthi,', 'working', 'at', 'tata', 'consultancy', 'services,', 'from', '3', 'years.']


In [None]:
#Check the list of words and eliminate the stop words using stopwords()
#Build the stop-word collection once, outside the comprehension: calling
#stopwords.words('english') in the filter condition would rebuild the list for every token.
#A set also makes each membership test a constant-time lookup.
english_stopwords = set(stopwords.words('english'))
text1 = [i for i in text1 if i not in english_stopwords]
print(text1)

['name', 'shruthi,', 'working', 'tata', 'consultancy', 'services,', '3', 'years.']


## Tokenisation

In [None]:
#Convert the text into lower case for uniformity
#(starts again from the original `text`, independently of text1)
text2 = text.lower()
print(text2)

my name is shruthi, working at tata consultancy services, from 3 years.


In [None]:
#Instead of using split(), we can use word_tokenize() to split the data
#Unlike split(), word_tokenize() also separates punctuation into tokens of its own
text2 = word_tokenize(text2)
print(text2)

['my', 'name', 'is', 'shruthi', ',', 'working', 'at', 'tata', 'consultancy', 'services', ',', 'from', '3', 'years', '.']


In [None]:
#Check the list of words and eliminate the stop words using stopwords()
#Build the stop-word collection once, outside the comprehension: calling
#stopwords.words('english') in the filter condition would rebuild the list for every token.
#A set also makes each membership test a constant-time lookup.
english_stopwords = set(stopwords.words('english'))
text2 = [i for i in text2 if i not in english_stopwords]
print(text2)

['name', 'shruthi', ',', 'working', 'tata', 'consultancy', 'services', ',', '3', 'years', '.']


## Bag of words model

In [None]:
#In order to feed the data to an ML model, we need to convert the non-numerical values to numerical values
#For that we use CountVectorizer()
#NOTE(review): text2 is a list of tokens, so every token is treated as its own document
#(the shape printed further down is (11, 7)); the punctuation and '3' entries end up as all-zero rows
vectorizer = CountVectorizer()
mat = vectorizer.fit_transform(text2)
print(mat)    # prints the (row, column) position and the count of every non-zero cell

  (0, 1)	1
  (1, 3)	1
  (3, 5)	1
  (4, 4)	1
  (5, 0)	1
  (6, 2)	1
  (9, 6)	1


In [None]:
#Dense view of the sparse count matrix: one row per entry of text2, one column per vocabulary word
print(mat.toarray())

[[0 1 0 0 0 0 0]
 [0 0 0 1 0 0 0]
 [0 0 0 0 0 0 0]
 [0 0 0 0 0 1 0]
 [0 0 0 0 1 0 0]
 [1 0 0 0 0 0 0]
 [0 0 1 0 0 0 0]
 [0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0]
 [0 0 0 0 0 0 1]
 [0 0 0 0 0 0 0]]


In [None]:
#Matrix dimensions, followed by the vocabulary word behind each column
print(mat.shape)
print(vectorizer.get_feature_names_out())

(11, 7)
['consultancy' 'name' 'services' 'shruthi' 'tata' 'working' 'years']


## Stemming

In [None]:
#Reduce every token to its stem with the Porter stemmer
stemmer = PorterStemmer()
porter_stem = [stemmer.stem(i) for i in text2]
print(porter_stem)

['name', 'shruthi', ',', 'work', 'tata', 'consult', 'servic', ',', '3', 'year', '.']


In [None]:
#Stem the same tokens with the Snowball (English) stemmer so the result can be compared with Porter
stemmer = SnowballStemmer('english')
snowball_stem = [stemmer.stem(i) for i in text2]
#Print the Snowball result computed in this cell (not the Porter list from the previous cell)
print(snowball_stem)

['name', 'shruthi', ',', 'work', 'tata', 'consult', 'servic', ',', '3', 'year', '.']


In [None]:
#Side-by-side table of every token with its Porter stem and its Snowball stem
df = pd.DataFrame({'token': text2, 'porter_stemmed': porter_stem, 'snowball_stemmed': snowball_stem})

In [None]:
#Show the token / Porter / Snowball comparison table
print(df)

          token porter_stemmed snowball_stemmed
0          name           name             name
1       shruthi        shruthi          shruthi
2             ,              ,                ,
3       working           work             work
4          tata           tata             tata
5   consultancy        consult          consult
6      services         servic           servic
7             ,              ,                ,
8             3              3                3
9         years           year             year
10            .              .                .


## Lemmatization

In [None]:
#Lemmatize every token with the WordNet lemmatizer
#NOTE(review): lemmatize() is called without a part-of-speech argument; in the printed result
#'services' becomes 'service' while 'working' is left unchanged
lemmatizer = WordNetLemmatizer()
lemmatized = [lemmatizer.lemmatize(i) for i in text2]
print(lemmatized)

['name', 'shruthi', ',', 'working', 'tata', 'consultancy', 'service', ',', '3', 'year', '.']


In [None]:
#Compare every token with its Porter stem and its WordNet lemma in one table
dict1 = {'token': text2, 'Stemmed': porter_stem, 'lemmatized': lemmatized}
df1 = pd.DataFrame(dict1)
print(df1)

          token  Stemmed   lemmatized
0          name     name         name
1       shruthi  shruthi      shruthi
2             ,        ,            ,
3       working     work      working
4          tata     tata         tata
5   consultancy  consult  consultancy
6      services   servic      service
7             ,        ,            ,
8             3        3            3
9         years     year         year
10            .        .            .


In [None]:
#TF-IDF weighting of the same token list
#NOTE: this rebinds `vectorizer`, which held the CountVectorizer until now
#Every entry of text2 is again passed as its own document; all non-zero weights printed below are 1.0
vectorizer = TfidfVectorizer()
mat1 = vectorizer.fit_transform(text2)
print(mat1)

  (0, 1)	1.0
  (1, 3)	1.0
  (3, 5)	1.0
  (4, 4)	1.0
  (5, 0)	1.0
  (6, 2)	1.0
  (9, 6)	1.0


In [None]:
#Show the bag-of-words counts as a labelled table (one column per vocabulary word).
#The table gets its own name instead of overwriting `mat`: rebinding `mat` to a DataFrame
#makes this cell fail when it is run a second time, because a DataFrame has no .toarray().
#NOTE(review): when the notebook is run top to bottom, `vectorizer` is the TfidfVectorizer
#fitted on the same text2, so its feature names are expected to line up with the count
#matrix columns - confirm.
bow_counts = pd.DataFrame(mat.toarray(), columns=vectorizer.get_feature_names_out())
bow_counts

Unnamed: 0,consultancy,name,services,shruthi,tata,working,years
0,0,1,0,0,0,0,0
1,0,0,0,1,0,0,0
2,0,0,0,0,0,0,0
3,0,0,0,0,0,1,0
4,0,0,0,0,1,0,0
5,1,0,0,0,0,0,0
6,0,0,1,0,0,0,0
7,0,0,0,0,0,0,0
8,0,0,0,0,0,0,0
9,0,0,0,0,0,0,1


### Q1.
Write a regular expression to match all the files that have either .exe, .xml or .jar extensions. A valid file name contains only letters, digits and underscores, followed by the extension.

In [None]:
# Candidate file names to screen by extension.
files = ['employees.xml', 'calculator.jar', 'nfsmw.exe', 'bkgrnd001.jpg', 'sales_report.ppt']

# Raw string, so `\.` reaches the regex engine as an escaped dot instead of being an
# invalid Python string escape. The name part is limited to letters, digits and
# underscores, as the exercise requires; a bare `.+` would also accept spaces and punctuation.
pattern = r"^[A-Za-z0-9_]+\.(xml|jar|exe)$"

# Keep only the names the pattern accepts.
result = [file for file in files if re.search(pattern, file) is not None]

# print result - result should only contain the items that match the pattern
print(result)

['employees.xml', 'calculator.jar', 'nfsmw.exe']


### Q2
Write a regular expression to match all the addresses that have Koramangala embedded in them.

Strings that should match:
* 466, 5th block, Koramangala, Bangalore
* 4th BLOCK, KORAMANGALA - 560034

Strings that shouldn't match:
* 999, St. Marks Road, Bangalore


In [None]:
# Sample addresses: the first two mention Koramangala, the last one does not.
addresses = ['466, 5th block, Koramangala, Bangalore', '4th BLOCK, KORAMANGALA - 560034', '999, St. Marks Road, Bangalore']

# The task is only "Koramangala appears somewhere in the address", so a plain substring
# pattern is enough. Wrapping it in an anchored character class would wrongly reject
# addresses that contain other punctuation such as '.', '#', '/' or brackets.
pattern = r"koramangala"

# re.I makes the search case-insensitive, so 'KORAMANGALA' is accepted as well.
result = [address for address in addresses if re.search(pattern, address, re.I) is not None]

# print result - result should only contain the items that match the pattern
print(result)

['466, 5th block, Koramangala, Bangalore', '4th BLOCK, KORAMANGALA - 560034']


### Q3.
Write a regular expression that matches either integer numbers or floats with up to 2 decimal places.

Strings that should match:
* 2
* 2.3
* 4.56
* .61

Strings that shouldn't match:
* 4.567
* 75.8792
* abc


In [None]:
# Inputs to classify: integers and floats with at most two decimals should pass.
numbers = ['2', '2.3', '4.56', '.61', '4.567', '75.8792', 'abc']

# Two alternatives, both anchored to the whole string:
#   [0-9]+(\.[0-9]{1,2})?  - an integer, optionally followed by a dot and 1-2 decimals
#   \.[0-9]{1,2}           - a bare fraction such as '.61'
# Requiring at least one digit on the relevant side keeps the empty string, a lone '.'
# and a dangling '2.' from matching. The raw string keeps `\.` a valid escape.
pattern = r"^([0-9]+(\.[0-9]{1,2})?|\.[0-9]{1,2})$"

# Keep only the strings the pattern accepts.
result = [number for number in numbers if re.search(pattern, number) is not None]

# print result - result should only contain the items that match the pattern
print(result)

['2', '2.3', '4.56', '.61']


### Q4.
Write a regular expression to match the model names of smartphones which follow the following pattern:

mobile company name followed by underscore followed by model name followed by underscore followed by model number

Strings that should match:
* apple_iphone_6
* samsung_note_4
* google_pixel_2

Strings that shouldn’t match:
* apple_6
* iphone_6
* google\_pixel\_


In [None]:
# Candidate model strings: only the first three follow company_model_number.
phones = ['apple_iphone_6', 'samsung_note_4', 'google_pixel_2', 'apple_6', 'iphone_6', 'google_pixel_']

# company '_' model '_' number, each part non-empty:
#   - company and model names are runs of letters/digits, so they cannot be empty and
#     cannot swallow extra underscores (which `.*` would allow, e.g. '__6' or 'a_b_c_6');
#   - the model number is one or more digits, so multi-digit numbers such as '10' match.
pattern = r"^[A-Za-z0-9]+_[A-Za-z0-9]+_[0-9]+$"

# Keep only the strings the pattern accepts.
result = [phone for phone in phones if re.search(pattern, phone) is not None]

# print result - result should only contain the items that match the pattern
print(result)

['apple_iphone_6', 'samsung_note_4', 'google_pixel_2']


### Q5.
Write a regular expression that can be used to match the emails present in a database.

The pattern of a valid email address is defined as follows:
The '@' character can be preceded by alphanumeric characters, period characters or underscore characters. The length of the part that precedes the '@' character should be between 4 and 20 characters.

The '@' character should be followed by a domain name (e.g. gmail.com). The domain name has three parts - a prefix (e.g. 'gmail'), the period character and a suffix (e.g. 'com'). The prefix must be between 3 and 15 characters long and is followed by a period character and then one of these suffixes - 'com', 'in' or 'org'.


Emails that should match:
* random.guy123@gmail.com
* mr_x_in_bombay@gov.in

Emails that shouldn’t match:
* 1@ued.org
* @gmail.com
* abc!@yahoo.in
* sam_12@gov.us
* neeraj@

In [None]:
# Candidate addresses: only the first two satisfy the rules stated above.
emails = ['random.guy123@gmail.com', 'mr_x_in_bombay@gov.in', '1@ued.org',
          '@gmail.com', 'abc!@yahoo.in', 'sam_12@gov.us', 'neeraj@']

# local part : 4-20 characters drawn from letters, digits, '.' and '_'
# domain     : a 3-15 letter prefix, a literal dot, then com / in / org
# Written as a raw string so `\.` is passed to the regex engine as-is rather than
# being treated as an (invalid) Python string escape.
pattern = r"^[a-z_.0-9]{4,20}@[a-z]{3,15}\.(com|in|org)$"

# re.I lets the lower-case character classes accept upper-case letters too.
result = [email for email in emails if re.search(pattern, email, re.I) is not None]

# print result - result should only contain the items that match the pattern
print(result)

['random.guy123@gmail.com', 'mr_x_in_bombay@gov.in']
