In [1]:
import pandas as pd
import numpy as np
from sklearn.compose import ColumnTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import OneHotEncoder
from nltk.tokenize import word_tokenize
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score,confusion_matrix

# Linguistic street map of Singapore

There is considerable linguistic variety in Singapore road names: Malaysians names (Jalan Besar), British names (Northumberland road), Chinese names (Keong Saik Road), Indian names (Veerasamy road), Jewish names (Belilios road), and the usual generic sorts of names that describe either area landmarks or other common noun.

![Image](streets.png)

In [2]:
# load the dataset
url = 'https://raw.githubusercontent.com/um-perez-alvaro/Data-Science-Practice/master/Data/streets.csv'
street_names = pd.read_csv(url)
street_names.head(30)

Unnamed: 0,name,tag,origin
0,Saiboo,Street,Indian
1,Merchant,Loop,Generic
2,Hill,Street,Generic
3,Ophir,Road,Malay
4,Buona,Vista Road,Other
5,Sengkang,Avenue,Chinese
6,Ang Mo Kio,Avenue,Chinese
7,Brickland,Road,British
8,Choa Chu Kang,Road,Chinese
9,Woodlands,Avenue,Generic


**``Origin`` column values:**

- Chinese (all dialects including Cantonese, Hokkien, Mandarin, etc)
- Malay
- Indian (all languages of the subcontinent)
- British
- Generic (Race Course Road, Sunrise Place, etc)
- Other (Other languages).

In [3]:
street_names.origin.value_counts()

Malay      927
British    798
Generic    497
Chinese    403
Other      167
Indian      42
Name: origin, dtype: int64

The **goal** is to predict the ``origin`` column from the ``street`` and ``tag`` columns.

**Part 1:** Add the following features:

- Number of words in road name (more words => more likely to be Chinese.) Assign it to a column named `n_words`.
- Average word length in road name (longer words => more likely to be British or Indian.) Assign it to a column named `avg_word_len`.
- Is the road tag Malay? (if yes => very correlated with being Malay.) **Malay tags**: *Jalan*, *Lorong*, *Bukit*, *Lengkok*, *Taman*, *Kampong*, *Lengkong*. Assign it to a column named `is_malay`.

In [4]:

def N_words_text(text):
    words=word_tokenize(text)    
    return len(words)

In [5]:
def avg_word_leng(text):
    words=word_tokenize(text)
    total_leng=0
    for word in words:
        total_leng+=len(word)
    return total_leng/len(words)

In [6]:
malay=['Jalan', 'Lorong', 'Bukit', 'Lengkok', 'Taman', 'Kampong', 'Lengkong']
street_names['is_maylay'] = np.where(street_names.tag.isin(malay), 1, 0)
    

In [7]:
street_names['n_words']=street_names.name.apply(N_words_text)

In [9]:
street_names['avg_word_len']=street_names.name.apply(avg_word_leng)

**Part 2:** define the matrix X (columns `name`, `n_words`, `avg_word_len`, `is_malay`) and the target vector y (column `origin`)

In [19]:
X=street_names.drop(['origin','tag'],axis='columns')
X

Unnamed: 0,name,is_maylay,n_words,avg_word_len
0,Saiboo,0,1,6.0
1,Merchant,0,1,8.0
2,Hill,0,1,4.0
3,Ophir,0,1,5.0
4,Buona,0,1,5.0
...,...,...,...,...
2829,Florence,0,1,8.0
2830,Hoot Kiam,0,2,4.0
2831,Clarke,0,1,6.0
2832,Countryside,0,1,11.0


In [20]:
y=street_names.origin
y

0        Indian
1       Generic
2       Generic
3         Malay
4         Other
         ...   
2829    Chinese
2830    Chinese
2831    British
2832    Generic
2833      Other
Name: origin, Length: 2834, dtype: object

**Part 3:** split X and y into training and testing sets.

In [21]:
X_train,X_test,y_train,y_test= train_test_split(X,y)

In [22]:
y_test

761     Chinese
2833      Other
1165    Generic
2086    Generic
1301      Malay
         ...   
2819    Chinese
1504      Malay
307     British
644       Other
2831    British
Name: origin, Length: 709, dtype: object

**Part 4:** build a classification pipeline.

The pipeline must use `CountVectorizer` to compute the number of **character 1-gram, 2-grams, 3-grams and 4-grams** from the column `name` ([CountVectorizer](https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.CountVectorizer.html#sklearn.feature_extraction.text.CountVectorizer) has an `analyzer` parameter that you'll have to set to `analyzer='char'`.)

In [27]:

processor=ColumnTransformer(transformers=[
    ('count_vect',CountVectorizer(max_features=20,ngram_range=(1,4), analyzer='char'),'name'),
])

In [37]:
from sklearn.linear_model import LogisticRegression
pipe=Pipeline(steps=[
    ('processor',processor),
    ('clf',LogisticRegression(max_iter=5000))
])

**Part 5:** Fit the classification pipeline to the training data, and  evaluate its performance on the testing set.

In [38]:
pipe.fit(X_train,y_train)

Pipeline(steps=[('processor',
                 ColumnTransformer(transformers=[('count_vect',
                                                  CountVectorizer(analyzer='char',
                                                                  max_features=20,
                                                                  ngram_range=(1,
                                                                               4)),
                                                  'name')])),
                ('clf', LogisticRegression(max_iter=5000))])

In [39]:
y_test_pred=pipe.predict(X_test)

In [40]:
accuracy_score(y_test,y_test_pred)

0.5895627644569816

In [41]:
confusion_matrix(y_test,y_test_pred)

array([[115,   6,  18,   0,  41,   0],
       [  7,  88,   0,   0,  12,   0],
       [ 71,   5,  28,   0,  27,   0],
       [  6,   0,   1,   0,   2,   0],
       [ 36,  13,   4,   0, 187,   0],
       [ 16,   3,   7,   0,  16,   0]], dtype=int64)