In [110]:
import pandas as pd
import re
import nltk
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import FunctionTransformer


In [111]:
# Define Custom Functions for each step

# Covert to Lower case
def lowerCase(text):
    #return text.lower()
    return text.str.lower()

# Remove html tags
def removeHtmlTags(text):
    return text.apply(lambda x:re.sub(r'<.*?>',"",x))

# Remove Urls
def removeUrls(text):
    return text.apply(lambda x:re.sub(r'https?://\S+|www\.S\+',"",x))

# Remove NewLine 
def handleNewLine(text):
    return text.apply(lambda x:re.sub(r'\n',' ',x))

# Remove Special Characters
def removeSpecialChars(text):
    return text.apply(lambda x:re.sub(r'[^0-1a-zA-Z]',' ',x))

# Remove Stopwords
def removeStopwords(text):
    import nltk
    from nltk.corpus import stopwords
    nltk.download('stopwords')
    stop_words = set(stopwords.words('english'))

    return text.apply(lambda x: ' '.join([word for word in x.split() if word.lower() not in stop_words]))

# Remove Empty strings


# Stemming
def stemTokens(text):
    from nltk.stem import SnowballStemmer
    snowballStemmer = SnowballStemmer('english')
    
    return text.apply(lambda x: ' '.join([snowballStemmer.stem(word) for word in x.split() if word != " "]))


# Lemmetization
def lemTokens(text):
    nltk.download('wordnet')
    nltk.download('omw-1.4')
    from nltk.stem import WordNetLemmatizer
    lem = WordNetLemmatizer()
 
    return text.apply(lambda x: ' '.join([lem.lemmatize(word) for word in x.split() if word != " "]))





In [112]:
df = pd.read_csv("Airbnb.csv")
df

Unnamed: 0.1,Unnamed: 0,Title,Location,reviewees,reviewers,reviews,ratings
0,0,UFO 'Futuro styled Flying Saucer'!,"Planes for Rent in Redberth, United Kingdom",Toby,Taya,Toby's listing is wonderful and unique. He was...,5
1,1,UFO 'Futuro styled Flying Saucer'!,"Planes for Rent in Redberth, United Kingdom",Toby,Philip Anthony,Amazing location. The attention to detail on t...,5
2,2,UFO 'Futuro styled Flying Saucer'!,"Planes for Rent in Redberth, United Kingdom",Toby,Vanessa,Great unique 'space' to stay. All facilities a...,5
3,3,UFO 'Futuro styled Flying Saucer'!,"Planes for Rent in Redberth, United Kingdom",Toby,Louise,Booke the ufo as a surprise 50th birthday brea...,5
4,4,UFO 'Futuro styled Flying Saucer'!,"Planes for Rent in Redberth, United Kingdom",Toby,Wayne,It's so unusual there is no comparison. Comple...,5
...,...,...,...,...,...,...,...
8676,8676,Three log cabins with WC and Sauna and Treehouse,"Huts for Rent in Llanwddyn, United Kingdom",Stuart,Sarah,.,5
8677,8677,Three log cabins with WC and Sauna and Treehouse,"Huts for Rent in Llanwddyn, United Kingdom",Stuart,Joanna,"Witam serdecznie, chciałbym podzielić się swoi...",5
8678,8678,Three log cabins with WC and Sauna and Treehouse,"Huts for Rent in Llanwddyn, United Kingdom",Stuart,Jeroen,We hebben een geweldig verblijf gehad bij Stua...,5
8679,8679,Three log cabins with WC and Sauna and Treehouse,"Huts for Rent in Llanwddyn, United Kingdom",Stuart,Frank,"Warm onthaal, een magische plaats met veel lie...",5


In [113]:
df.drop(columns = ['Unnamed: 0'],inplace=True)
df.dropna(inplace=True)
df.head()

Unnamed: 0,Title,Location,reviewees,reviewers,reviews,ratings
0,UFO 'Futuro styled Flying Saucer'!,"Planes for Rent in Redberth, United Kingdom",Toby,Taya,Toby's listing is wonderful and unique. He was...,5
1,UFO 'Futuro styled Flying Saucer'!,"Planes for Rent in Redberth, United Kingdom",Toby,Philip Anthony,Amazing location. The attention to detail on t...,5
2,UFO 'Futuro styled Flying Saucer'!,"Planes for Rent in Redberth, United Kingdom",Toby,Vanessa,Great unique 'space' to stay. All facilities a...,5
3,UFO 'Futuro styled Flying Saucer'!,"Planes for Rent in Redberth, United Kingdom",Toby,Louise,Booke the ufo as a surprise 50th birthday brea...,5
4,UFO 'Futuro styled Flying Saucer'!,"Planes for Rent in Redberth, United Kingdom",Toby,Wayne,It's so unusual there is no comparison. Comple...,5


In [114]:
# Create pipeline
pipeline = Pipeline([
    ('lowerCase', FunctionTransformer(lowerCase)),
    ('removeHtmlTags', FunctionTransformer(removeHtmlTags)),
    ('removeUrls', FunctionTransformer(removeUrls)),
    ('handleNewLine', FunctionTransformer(handleNewLine)),
    ('removeSpecialChars', FunctionTransformer(removeSpecialChars)),
    ('removeStopwords', FunctionTransformer(removeStopwords)),
    ('lemTokens', FunctionTransformer(lemTokens)),
    ('vectorize', TfidfVectorizer())
])


In [115]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score as acs
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split

In [116]:
# Recognize predictor or Input and target or output varibles for Machine learning
X = df['reviews']
y = df['ratings']

In [117]:
X_transformed = pipeline.fit_transform(X)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Kishore\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Kishore\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\Kishore\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [118]:
X_transformed

<8680x9909 sparse matrix of type '<class 'numpy.float64'>'
	with 179173 stored elements in Compressed Sparse Row format>

In [119]:
#  split x & y in to xTrain, xTest, yTrain, yTest
# if we pass 2 arrays we get train & test of the first array, then train & test of the second array
# randome_state parameter in test_train_split is simillar to random_seed parameter in random function
from sklearn.model_selection import train_test_split
xTrain, xTest, yTrain, yTest = train_test_split(X_transformed,y,
                                                test_size= 0.32)

In [120]:
# check if input variable train and test has same size as target variable
# the rows have to be same for us to proceed
xTrain.shape, xTest.shape, len(yTrain), len(yTest)

((5902, 9909), (2778, 9909), 5902, 2778)

# Logistic Regression

In [121]:
lr = LogisticRegression()

In [122]:
lr.fit(xTrain,yTrain)

LogisticRegression()

In [123]:
yTrain_pred = lr.predict(xTrain)
yTest_pred = lr.predict(xTest)

In [124]:
acs(yTrain,yTrain_pred)

0.8592002710945442

In [125]:
confusion_matrix(yTrain,yTrain_pred)

array([[   7,    0,    0,    0,    0,    1],
       [   0,    0,    0,    0,    4,   26],
       [   0,    0,    0,    0,   12,   34],
       [   0,    0,    0,   26,   34,  118],
       [   0,    0,    0,    0,  216,  594],
       [   0,    0,    0,    0,    8, 4822]], dtype=int64)

In [129]:
yTrain.value_counts()

5    4830
4     810
3     178
2      46
1      30
0       8
Name: ratings, dtype: int64

In [126]:
acs(yTest,yTest_pred)

0.8138948884089273

In [127]:
confusion_matrix(yTest,yTest_pred)

array([[   3,    0,    0,    0,    0,    0],
       [   0,    0,    0,    1,    3,   15],
       [   0,    0,    0,    0,    7,   10],
       [   0,    0,    0,    1,   25,   61],
       [   0,    0,    0,    0,   38,  371],
       [   0,    0,    0,    0,   24, 2219]], dtype=int64)

In [130]:
yTest.value_counts()

5    2243
4     409
3      87
1      19
2      17
0       3
Name: ratings, dtype: int64

In [128]:
# The predictions are pretty good