In [1]:
import re
import pandas as pd
import numpy as np
from collections import Counter
from textblob import TextBlob
from nltk.tokenize import WhitespaceTokenizer
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
# globals
stop = stopwords.words('english')

## Load data and clean

In [18]:
df = pd.read_json('../data/craigslist_music_scraped.json')
df.shape

(24998, 9)

In [19]:
# de-duplicate exact raw posts
df = df.loc[~df.duplicated(subset=['body'], keep='first')]
df.shape

(19920, 9)

In [20]:
def dup_clean(s):
    if type(s) != str:
        return ""
    no_qr = re.sub("QR Code Link to This Post", '', s).lower().strip()
    return no_qr

In [21]:
# modify text body to lowercase, remove QR tag and strip
df.body = df.body.apply(dup_clean)
# de-duplicate exact processed posts
df = df.loc[~df.duplicated(subset=['body'], keep='first')]
# combine title and body
df['body'] = df.title + ' \n' + df.body
df.shape

(19506, 9)

## Preprocessing

In [6]:
def prep_body_keepnum(s):
    if type(s) != str:
        return ""
    no_https = re.sub("https?://[^\s]+", '', s)
    no_http = re.sub("http?://[^\s]+", '', no_https)
    no_www = re.sub("www?[^\s]+", '', no_http)
    words = TextBlob(no_www).words
    words = [w for w in words if w not in stop]
    prep_body = ' '.join(words)
    return prep_body

def prep_body(s):
    if type(s) != str:
        return ""
    no_https = re.sub("https?://[^\s]+", '', s)
    no_http = re.sub("http?://[^\s]+", '', no_https)
    no_www = re.sub("www?[^\s]+", '', no_http)
    body = re.sub("[0-9]",'',no_www).strip()
    words = TextBlob(body).words
    words = [w for w in words if w not in stop]
    prep_body = ' '.join(words)
    return prep_body

def compress(iterables):
    outer = []
    for it in iterables:
        if type(it) != list:
            outer.append(it)
        else:
            outer += it
    return Counter(outer)

def extract_genres(s):
    genre_r = []
    for genre in genres:
        if genre in s:
            genre_r.append(genre_map[genre])
    return compress(genre_r)

def extract_instruments(s):
    instr_r = []
    for instrument in instrument_types:
        if instrument in s:
            instr_r.append(instruments[instrument])
    return compress(instr_r)

In [7]:
df["prep_body_keepnum"] = df["body"].apply(prep_body_keepnum)
df["prep_body"] = df["body"].apply(prep_body)
df["body_len"] = df["prep_body"].apply(lambda x: len(x))

## Stemming

In [8]:
def stem_text(body):
    words = TextBlob(body).words
    stemmer = PorterStemmer()
    stems = [stemmer.stem(word) for word in words]
    stemmed = ' '.join(stems)
    return stemmed

In [9]:
df["prep_body"] = df["prep_body"].apply(str)
df["prep_body_keepnum"] = df["prep_body_keepnum"].apply(str)
df["prep_body"] = df["prep_body"].apply(stem_text)
df["prep_body_keepnum"] = df["prep_body_keepnum"].apply(stem_text)

## De-duplicate based on cosine similarity

In [10]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import linear_kernel, cosine_similarity

In [11]:
posts = df.prep_body.values.tolist()
vectorizer = CountVectorizer()
count_vec = vectorizer.fit_transform(posts)

In [12]:
count_vec.shape

(19506, 33181)

In [19]:
threshold = 0.98
kill = set()
for i in range(count_vec.shape[0]):
    cosine_similarities = cosine_similarity(count_vec[i:i+1], count_vec).flatten()
    indices = np.where(cosine_similarities > threshold)[0]
    indices = indices[np.where(indices > i)]
    for kill_index in indices:
        kill.add(kill_index)

In [21]:
len(kill)

1140

In [26]:
df.shape

(19506, 12)

In [56]:
df_dedup = df.drop(df.index[kill])

In [57]:
df_dedup.shape

(18366, 12)

## Export data for modelling

In [59]:
df_dedup.to_json('../data/posts_preprocessed.json')

In [None]:
df_viz = df_viz.dropna()

df_viz['topic'] = pd.Series(np.random.choice(5, len(df_viz))).values
df_viz = df_viz[["gmap_lon", "gmap_lat", "city", "topic"]]
df_viz.to_json('../data/map_data.json')

## Get genre / instrument categories

In [None]:
from mappings.genre_map import genre_map
from mappings.instruments import instruments
# globals
genres = genre_map.keys()
instrument_types = instruments.keys()

In [None]:
df["genre_counts"] = df["prep_body"].apply(extract_genres)
df["instrument_counts"] = df["prep_body"].apply(extract_instruments)

In [None]:
df.instrument_counts

In [None]:
genre_counts = pd.DataFrame(df["genre_counts"].tolist()).fillna(0)
instr_counts = pd.DataFrame(df["instrument_counts"].tolist()).fillna(0)
df = pd.concat([df, genre_counts, instr_counts], axis=1)

In [None]:
df.head()

In [None]:
df[genre_counts.columns].describe()

In [None]:
df[instr_counts.columns].describe()