## To follow along with the course notebook, go to:
### https://drive.google.com/drive/u/0/folders/1IBIX-XVcGw89t-2nr4cm3CfvLIHC8yh0

## Basic String Manipulation with Python

In [18]:
# Sample sentence reused by the string-method demos below.
my_string = "Lionel Messi is the GoAt."

In [19]:
# str.lower() returns a lower-cased copy; my_string itself is left untouched.
lowercased = my_string.lower()
print(lowercased)

lionel messi is the goat.


In [20]:
# str.upper() returns an upper-cased copy; my_string itself is left untouched.
uppercased = my_string.upper()
print(uppercased)

LIONEL MESSI IS THE GOAT.


In [21]:
# Break the sentence into a list of words, cutting at every single space.
words = my_string.split(" ")
print(words)

['Lionel', 'Messi', 'is', 'the', 'GoAt.']


In [22]:
# Two sample phrases used by the concatenation, length, count and slicing demos.
first_string, second_string = (
    "Aaha Tamatar bade majedaar",
    "Wah Tamater bade majedaar",
)

In [23]:
# Glue the two phrases together with a single space between them.
result = " ".join((first_string, second_string))
print(result)

Aaha Tamatar bade majedaar Wah Tamater bade majedaar


In [24]:
# len() counts every character in the string, spaces included.
char_count = len(first_string)
print(char_count)

26


In [25]:
# Occurrences of lower-case "a" only: str.count is case-sensitive,
# so the capital "A" at the start of the phrase is not counted.
first_string.count("a")

9

In [26]:
# Slice out the first 12 characters (indices 0 through 11).
substring = first_string[:12]
print(substring)

Aaha Tamatar


## Regular Expression

In [27]:
import re

### 1. ^ (Caret)
 
- Description: Matches the start of a string.
- Usage: Used to check if a string starts with a certain character or pattern.

In [28]:
# ^ anchors the pattern to the start of the string.
# re.search is used on purpose: re.match only ever tries position 0, so with
# re.match the caret would make no visible difference to the result.
pattern = r"^Hello"
text = "Hello world"
match = re.search(pattern, text)
print(match)  # Output: <re.Match object; span=(0, 5), match='Hello'>

# "Hello" is present here, but not at the start, so the anchored pattern fails.
text2 = "Hi, Hello world"
match2 = re.search(pattern, text2)
print(match2)  # Output: None

<re.Match object; span=(0, 5), match='Hello'>
None


### 2. $ (Dollar Sign)

- Description: Matches the end of a string.
- Usage: Used to check if a string ends with a certain character or pattern.

In [29]:
# $ anchors the pattern to the end of the string.
pattern = r"world$"
text = "Hello world"
text2 = "Hello world!"  # the trailing "!" means this string no longer ends in "world"

match = re.search(pattern, text)
match2 = re.search(pattern, text2)

print(match)  # Output: <re.Match object; span=(6, 11), match='world'>
print(match2)  # Output: None

<re.Match object; span=(6, 11), match='world'>
None


### 3. ? (Question Mark)

- Description: Matches 0 or 1 occurrence of the preceding element.
- Usage: Used to denote that the preceding character or group is optional.

In [30]:
# "u?" makes the letter u optional: zero or one "u" between "colo" and "r".
pattern = r"colou?r"
text1, text2, text3 = "color", "colour", "colouur"

# Run the same search against each spelling.
match1, match2, match3 = (
    re.search(pattern, candidate) for candidate in (text1, text2, text3)
)

# Output:
#   <re.Match object; span=(0, 5), match='color'>
#   <re.Match object; span=(0, 6), match='colour'>
#   None   (two consecutive "u"s exceed the single optional one)
print(match1, match2, match3, sep="\n")

<re.Match object; span=(0, 5), match='color'>
<re.Match object; span=(0, 6), match='colour'>
None


### 4. . (Dot)

- Description: Matches any single character except a newline.
- Usage: Used to match any character at a specific position.

In [34]:
# "." stands in for exactly one character between "h" and "t".
pattern = r"h.t"
text = "hat hot hit hut ram"
matches = [found.group() for found in re.finditer(pattern, text)]
print(matches)  # Output: ['hat', 'hot', 'hit', 'hut']

['hat', 'hot', 'hit', 'hut']


### 5. {m,n} (Curly Braces)

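- Description: Matches between m and n occurrences of the preceding element. Matching is greedy: as many occurrences as possible (up to n) are consumed, and any leftover characters can start a new match.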
- Usage: Used to specify the minimum and maximum number of times the preceding element can occur.

In [32]:
# a{1,3} matches a run of one to three "a" characters, as many as possible.
pattern = r"a{1,3}"
text = "a aa aaa aaaa"
matches = re.findall(pattern, text)
# The final "aaaa" is longer than three, so it is consumed as "aaa" followed by "a".
print(matches)  # Output: ['a', 'aa', 'aaa', 'aaa', 'a']

# b{2,4} needs at least two "b"s, so the lone "b" is skipped.
pattern2 = r"b{2,4}"
text2 = "b bb bbb bbbb bbbbb"
matches2 = re.findall(pattern2, text2)
# "bbbbb" yields "bbbb"; the single "b" left over is too short to match again.
print(matches2)  # Output: ['bb', 'bbb', 'bbbb', 'bbbb']

['a', 'aa', 'aaa', 'aaa', 'a']
['bb', 'bbb', 'bbbb', 'bbbb']


### 6. | (Pipe)
- Description: Acts as an OR operator, matching the pattern before or after the pipe.
- Usage: Used to match one of several patterns.

In [33]:
# "|" tries each alternative in turn; either "cat" or "dog" counts as a match.
pattern = r"cat|dog"
text = "I have a cat and a dog."
matches = re.findall(pattern, text)
print(matches)  # Output: ['cat', 'dog']

# Alternatives match as substrings: "apple" is found inside "apples" and
# "banana" inside "bananas". "cherries" does not contain "cherry", so it is missed.
pattern2 = r"apple|banana|cherry"
text2 = "I like apples, bananas, and cherries."
matches2 = re.findall(pattern2, text2)
print(matches2)  # Output: ['apple', 'banana']

['cat', 'dog']
['apple', 'banana']


In [38]:
import nltk

# Fetch the movie_reviews corpus into the local NLTK data directory.
nltk.download("movie_reviews")

[nltk_data] Downloading package movie_reviews to
[nltk_data]     /home/sudip/nltk_data...
[nltk_data]   Unzipping corpora/movie_reviews.zip.


True

In [39]:
import nltk
from nltk.corpus import movie_reviews
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report

# Load every review as a (token list, sentiment label) pair
documents = [
    (list(movie_reviews.words(file_id)), sentiment)
    for sentiment in movie_reviews.categories()
    for file_id in movie_reviews.fileids(sentiment)
]

# Tabulate the reviews: one row per document
df = pd.DataFrame(documents, columns=['text', 'label'])

# Collapse each token list into one space-separated string
df['text'] = df['text'].apply(' '.join)

# Hold out 20% of the reviews for testing; random_state pins the shuffle
X_train, X_test, y_train, y_test = train_test_split(
    df['text'],
    df['label'],
    test_size=0.2,
    random_state=42,
)

# TF-IDF features: fitted on the training split only, then applied to the test split
vectorizer = TfidfVectorizer()
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

# Fit a multinomial Naive Bayes classifier on the training features
clf = MultinomialNB().fit(X_train_tfidf, y_train)

# Predict labels for the held-out reviews
y_pred = clf.predict(X_test_tfidf)

# Report overall accuracy plus per-class precision / recall / F1
print(f'Accuracy: {accuracy_score(y_test, y_pred)}')
print(classification_report(y_test, y_pred))

Accuracy: 0.8075
              precision    recall  f1-score   support

         neg       0.76      0.90      0.82       199
         pos       0.88      0.72      0.79       201

    accuracy                           0.81       400
   macro avg       0.82      0.81      0.81       400
weighted avg       0.82      0.81      0.81       400

