# RC Text Recommender Data Exploration
Author: ppetruneac <br>

This notebook 
- explores the original data, 
- makes assumptions about which data / resources to keep




In [1]:
import pandas as pd
import re, string, warnings

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import NMF
from sklearn.preprocessing import Normalizer, MaxAbsScaler
from sklearn.pipeline import make_pipeline

## Read

Sterg date care au NA in title, content sau in type. 

In [14]:
import pandas as pd

# Load the raw export; fields are separated by '~'.
df = pd.read_csv('../data/sample/tilda_separated.csv', delimiter='~')

# Keep only the rows in which title, content and type are all present.
df = df.dropna(subset=['title', 'content', 'type'])

print("shape = {}".format(df.shape))
display(df.head())


shape = (85183, 4)


Unnamed: 0,id,title,type,content
0,1,Întrucât atârnă de voi,1,VERSE 1:\r\nb\t\t F#\r\nÎntrucât atârnă de ...
1,2,God will make a way,1,D\r\nGod will make a way\r\n ...
2,3,King of Glory,1,E B\r\nWho is this King o...
3,4,Curgi ca un râu de slavă,1,E\r\n Curgi ca un râu de slavă\r\n ...
4,5,Inima mea tânjește,1,C F C G\r\nInima mea...


## Add datetime

In [3]:
# Replace `df` with the cleaned sample file and preview its first rows.
clean_sample_csv = '../data/sample/tilda_separated_filter_clean.csv'
df = pd.read_csv(clean_sample_csv)
df.head()

Unnamed: 0,id,title,type,content
0,72714,boboteaza,7,boboteaza iordanul martorul t cut sl vea prin ...
1,160304,in lumini de primavara,7,in lumini de primavara se nnoiesc vesminte pes...
2,128808,indemn pentru credinciosi,5,ndemn pentru credinciosi ospetele se fac pentr...
3,1481,ma intreb,2,ma intreb doamne tata cum am existat in lume f...
4,131290,"barierele din calea ta nu le poti muta singur,...",6,barierele din calea ta nu le poti muta singur ...


In [6]:
# Attach a synthetic, uniformly distributed random timestamp to every row of `df`.
import random
from datetime import datetime, timedelta

# A dedicated, seeded generator makes the random draws reproducible on
# Restart & Run All without touching the global `random` state.
RANDOM_DATE_SEED = 42
rng = random.Random(RANDOM_DATE_SEED)

min_year = 2017

# Timestamps are drawn from [start, end). `end` is the moment the cell runs,
# so absolute values still shift with the run date even though the random
# fractions are fixed by the seed.
start = datetime(min_year, 1, 1)
end = datetime.today()
span = end - start

# One independent draw per row.
random_date = [start + span * rng.random() for _ in range(len(df))]

df['datetime'] = random_date
display(df.head())

# NOTE: this writes to the same sample CSV path used elsewhere in the notebook
# as an input, so the file on disk gains/replaces the `datetime` column.
df.to_csv(path_or_buf='../data/sample/tilda_separated_filter_clean.csv', header=True, index=False)

Unnamed: 0,id,title,type,content,datetime
0,72714,boboteaza,7,boboteaza iordanul martorul t cut sl vea prin ...,2018-07-21 19:09:32.945341
1,160304,in lumini de primavara,7,in lumini de primavara se nnoiesc vesminte pes...,2017-01-21 20:44:51.991324
2,128808,indemn pentru credinciosi,5,ndemn pentru credinciosi ospetele se fac pentr...,2018-09-09 18:17:21.350718
3,1481,ma intreb,2,ma intreb doamne tata cum am existat in lume f...,2017-01-02 11:23:22.443332
4,131290,"barierele din calea ta nu le poti muta singur,...",6,barierele din calea ta nu le poti muta singur ...,2017-10-31 10:44:45.013566


## Data Exploration

> **Numarul resurselor si media lungimii textului pe tipul de resursa**

Urmatoarele tipuri de resurse nu sunt de tip text, de aceea se vor exclude. 

- 15 - par sa fie partituri
- 17, 18, 20, 22, 30, 31 - par sa fie resurse audio (? si/sau video)
- 29, 40, 48 - resurse video (e.g. marturii)
- 35 - nu stiu ce fel de resursa este
- 12 este sceneta; doar una singura cu o lungime foarte mare, de aceea va fi exclusa.

In [16]:
# Reference only (not used by the code): names of the resource types.
# The list appears to follow ascending type id, with unnamed types shown
# as their numeric id -- TODO confirm against the source system.
#   'acorduri', 'cantece', 'devotionale', 'editoriale', 'eseuri', 'maxime',
#   'poezii', 'schite', 'studii', 'predici', 'carti', 'scenete', 'partituri',
#   '17', '18', '20', '22', '29', '30', '31', '35', 'Jocuri', 'lectia zilnica',
#   '40', 'biografii', 'marturii', 'programe crestine', 'cugetari',
#   'dezbateri', '48'

# Type ids excluded from the text corpus (reasons are listed in the markdown
# cell of this section).
resource_type2remove = [15, 17, 18, 20, 22, 30, 31, 29, 40, 48, 35, 12]

# Keep every row whose type is not in the exclusion list.
keep_mask = ~df['type'].isin(resource_type2remove)
df = df[keep_mask]

# Number of remaining resources per type.
df.groupby('type')['content'].count()


type
1      2640
2     11877
3      5476
4       527
5      4100
6     13366
7     42201
8       955
9       813
10     1751
11       14
37       46
39      119
42       65
43      314
45       56
46      787
47       75
Name: content, dtype: int64

## Write

In [17]:
# Persist the filtered frame as a '~'-separated file (header row, no index column).
filtered_csv = '../data/interim/tilda_separated_filter.csv'
df.to_csv(filtered_csv, sep='~', index=False)
print(df.shape)

# Read the file back and preview it as a quick sanity check of the round trip.
pd.read_csv(filtered_csv, sep='~').head()

(85182, 5)


Unnamed: 0,id,title,type,content,datetime
0,1,Întrucât atârnă de voi,1,VERSE 1:\r\nb\t\t F#\r\nÎntrucât atârnă de ...,2018-10-30 17:56:43.820838
1,2,God will make a way,1,D\r\nGod will make a way\r\n ...,2019-01-16 17:02:17.204537
2,3,King of Glory,1,E B\r\nWho is this King o...,2018-06-04 02:36:54.428884
3,4,Curgi ca un râu de slavă,1,E\r\n Curgi ca un râu de slavă\r\n ...,2018-03-04 02:23:32.413976
4,5,Inima mea tânjește,1,C F C G\r\nInima mea...,2017-01-24 22:52:51.879640
