In [1]:
import pickle
import os
import re
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
with open('df_train_leftMerged.pickle','rb') as k:
    df_train = pickle.load(k)

In [3]:
df_train[['id_33','DeviceType','DeviceInfo']]

Unnamed: 0,id_33,DeviceType,DeviceInfo
0,,,
1,,,
2,,,
3,,,
4,2220x1080,mobile,SAMSUNG SM-G892A Build/NRD90M
5,,,
6,,,
7,,,
8,1334x750,mobile,iOS Device
9,,,


In [4]:
def preprocessing(line):
    if line is not np.NaN:
        line = str(line).lower()
        line = re.sub(r"[^\w\s]", '_', line)
        line = re.sub(r"\s+", ' ', line)
    return line

In [5]:
df_train['new_device_info'] = df_train['DeviceInfo'].map(preprocessing, na_action='ignore')

In [6]:
df_train[['DeviceInfo','new_device_info']]

Unnamed: 0,DeviceInfo,new_device_info
0,,
1,,
2,,
3,,
4,SAMSUNG SM-G892A Build/NRD90M,samsung sm_g892a build_nrd90m
5,,
6,,
7,,
8,iOS Device,ios device
9,,


In [8]:
df_train['DeviceInfo'].head

<bound method NDFrame.head of 0                                        NaN
1                                        NaN
2                                        NaN
3                                        NaN
4              SAMSUNG SM-G892A Build/NRD90M
5                                        NaN
6                                        NaN
7                                        NaN
8                                 iOS Device
9                                        NaN
10                                   Windows
11                                       NaN
12                                       NaN
13                                       NaN
14                                       NaN
15                                       NaN
16                                     MacOS
17                                   Windows
18                                       NaN
19                                       NaN
20                                       NaN
21                       

In [10]:
v = TfidfVectorizer()
x = v.fit_transform(df_train['new_device_info'].dropna())

In [16]:
x.toarray()[0].shape

(1615,)

In [18]:
x.toarray()

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

## Tinkering with TF-IDF and Clustering.

In [66]:
rarray = np.random.randint(2,size=(5000,5))

In [67]:
from sklearn.cluster import KMeans, DBSCAN

In [68]:
kmeans = KMeans(n_clusters=10).fit(rarray)

In [69]:
type(kmeans)

sklearn.cluster.k_means_.KMeans

In [70]:
nrarray = np.random.randint(2,size=(10,5))
print(nrarray)
device_code = kmeans.predict(nrarray)

[[0 0 0 1 0]
 [0 0 1 1 1]
 [0 0 1 1 0]
 [0 0 1 1 1]
 [0 0 0 1 1]
 [0 0 0 0 1]
 [0 1 1 0 0]
 [1 1 0 0 0]
 [1 0 1 1 1]
 [1 1 1 0 0]]


In [71]:
device_code

array([9, 8, 9, 8, 8, 1, 0, 0, 2, 7])

In [85]:
db = DBSCAN(eps=0.3, min_samples=100).fit(rarray)

In [86]:
n_clusters = len(set(db.labels_))
print('Estimated number of clusters: %d' % n_clusters)

Estimated number of clusters: 32


In [87]:
type(db)

sklearn.cluster.dbscan_.DBSCAN

In [88]:
db.fit_predict(nrarray)

array([-1, -1, -1, -1, -1, -1, -1, -1, -1, -1], dtype=int64)

In [89]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X_scaled = scaler.fit_transform(rarray)

In [90]:
kmeans = KMeans(n_clusters=10).fit(X_scaled)

In [91]:
kmeans.predict(nrarray)

array([0, 0, 0, 0, 9, 5, 8, 2, 9, 2])

In [92]:
nrarray

array([[0, 0, 0, 1, 0],
       [0, 0, 1, 1, 1],
       [0, 0, 1, 1, 0],
       [0, 0, 1, 1, 1],
       [0, 0, 0, 1, 1],
       [0, 0, 0, 0, 1],
       [0, 1, 1, 0, 0],
       [1, 1, 0, 0, 0],
       [1, 0, 1, 1, 1],
       [1, 1, 1, 0, 0]])

In [93]:
X_scaled

array([[ 1.01409939,  0.98767594, -1.01126343,  1.01126343,  1.00561577],
       [-0.98609664, -1.01247784, -1.01126343, -0.98886202, -0.99441559],
       [-0.98609664, -1.01247784,  0.98886202,  1.01126343, -0.99441559],
       ...,
       [ 1.01409939,  0.98767594, -1.01126343, -0.98886202, -0.99441559],
       [-0.98609664, -1.01247784, -1.01126343,  1.01126343,  1.00561577],
       [-0.98609664,  0.98767594,  0.98886202,  1.01126343, -0.99441559]])