In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor



In [3]:
df= pd.read_csv("public_maps (1).csv")

In [4]:
df.groupby("map_category_name").size()

map_category_name
Business         2167
Education        5820
Entertainment      99
Life              439
Other            3547
Productivity      586
Technology        902
dtype: int64

In [5]:
df.groupby("map_rating").size()

map_rating
10       21
20       24
30      112
32       12
33       19
36       34
40      117
43       25
45       64
47       13
49        7
50    13112
dtype: int64

In [68]:
df.head()

Unnamed: 0,map_id,map_title,map_rating,map_category_name,idea_id,idea_parent_id,idea_title
0,116533,PFK/Risikomanagement,50,Business,116533,,My first mindmap
1,116533,PFK/Risikomanagement,50,Business,116534,116533.0,Ideas for my novel ...
2,116533,PFK/Risikomanagement,50,Business,116535,116533.0,Welcome again!
3,116533,PFK/Risikomanagement,50,Business,116536,116535.0,We hope you\'ll have fun\nwith MindMeister ...
4,116533,PFK/Risikomanagement,50,Business,116537,116535.0,... and some great ideas too!


## The map ratings sample is heavily skewed towards 50

Below, only distinct map_ids are considered. The distribution of map ratings is massively skewed towards "50"

In [6]:
df.drop_duplicates("map_id")['map_rating'].value_counts() #drop duplicates selects distinct values of map_id

50    434
30      7
40      6
45      5
43      2
10      2
36      1
49      1
33      1
20      1
47      1
32      1
Name: map_rating, dtype: int64

## Exploring the map_category_name dsitribution


Below is the distribution of observations for each class.

Entertainment, Life and Productivity will be discarded due to the very low number of observations (<10, ~2% of the overall sample).

In [7]:
df.drop_duplicates("map_id")['map_category_name'].value_counts() #drop duplicates selects distinct values of map_id

Education        182
Other            128
Business          94
Technology        35
Life               9
Productivity       9
Entertainment      5
Name: map_category_name, dtype: int64

The imbalance is much more manageable after dropping the categories with <10 observations. Class weights can be used to augment the loss function to  be more sensitive to the minority classes.

In [80]:
df_uniques = df.drop_duplicates("map_id")

In [81]:
df_uniques_filtered = df_uniques[~df_uniques['map_category_name'].isin(["Life", "Productivity", "Entertainment"])]

Below I've fitted a decision tree model to assess whether "map_rating" has any association with the "map_category_name". I have also included map_id and idea_id even though I presume these to be autoincrementing ids that are not expected to have genuine association with the map_category_name.

In [128]:
tree = DecisionTreeRegressor()

In [131]:
idx = df_uniques_filtered[["map_id", "idea_id", "map_rating"]].dropna().index
idx_train, idx_test = train_test_split(idx, test_size=0.2, stratify=df_uniques_filtered[["map_category_name"]])


In [133]:
tree.fit(df_uniques_filtered.loc[idx_train, ["map_id", "idea_id", "map_rating"]], pd.Series(df_uniques_filtered["map_category_name"].factorize()[0], index=df_uniques_filtered.index )[idx_train])

DecisionTreeRegressor()

In [135]:
bool_prediction_istrue_series = (pd.Series(tree.predict(df_uniques_filtered.loc[idx_test, ["map_id", "idea_id", "map_rating"]])) == pd.Series(df_uniques_filtered["map_category_name"].factorize()[0], index=df_uniques_filtered.index)[idx_test].reset_index(drop=True))

accuracy = bool_prediction_istrue_series.sum() / bool_prediction_istrue_series.shape
accuracy[0]

0.5795454545454546

In [138]:
tree.feature_importances_ # "map_id", "idea_id", "map_rating" 

array([0.42425026, 0.56517177, 0.01057797])

According to the feature importance scores, map_rating is a poor feature for predicting the map_category_name, in contrast to map_id, idea_id which are spuriously associated at best.


Therefore I conclude that an NLP approach will be required, modelling some combination of map_title and idea_title.

## Exploring the idea_title dataset

There is a lot of duplicated default text such as "My First Mind Map" in the idea_title column. This default text will not be useful in predicting the map_category_name, so I will exclude it

In [145]:
 df['idea_title'].value_counts()[:40] # drop these

ENTER to add siblings                                                                                    192
DEL to delete                                                                                            192
TAB to insert (Mac OS)                                                                                   191
... and some great ideas too!                                                                            191
Get started!                                                                                             191
Use toolbar to add ideas                                                                                 191
Key shortcuts                                                                                            191
INS to insert (Windows)                                                                                  191
We hope you\'ll have fun\nwith MindMeister ...                                                           188
Ideas for my novel 

In [141]:
recurring_defult_strings = df['idea_title'].value_counts()[:40].index
recurring_defult_strings

Index(['ENTER to add siblings', 'DEL to delete', 'TAB to insert (Mac OS)',
       '... and some great ideas too!', 'Get started!',
       'Use toolbar to add ideas', 'Key shortcuts', 'INS to insert (Windows)',
       'We hope you\'ll have fun\nwith MindMeister ...',
       'Ideas for my novel ...', 'Welcome again!', 'My Geistesblitzes',
       'My First Mind Map', 'New node', 'All key shortcuts',
       'Check out \nhttp://www.mindmeister.com/services/tools/geistesblitz_widgets',
       'Drag &amp; Drop and double-click canvas',
       'Drag &amp; Drop and\rdouble-click canvas',
       'Find out more? Try http://www.mindmeister.com/help',
       'Email &amp; SMS Gateways', 'Geistesblitz Tools',
       'Find out more? Try\rhttp://www.mindmeister.com/help', 'Offline Mode',
       'Tools and Gadgets', 'Compare Editions', 'Get started now!', 'more...',
       'Meeting Minutes', 'Find out more?', 'Online Help', 'Project Plan',
       'Use Cases &amp; Templates', 'Personal Todo List', 'Vacat

In [142]:
df_cleaned_idea_title = df[~df['idea_title'].isin(recurring_defult_strings)]
df_cleaned_idea_title['text'] = df_cleaned_idea_title.groupby("map_id")['idea_title'].transform(lambda x: ' [SEP] '.join(x)).drop_duplicates()
df_input = df_cleaned_idea_title.drop_duplicates("map_id").dropna(subset="text")
df_input = df_input[~df_input['map_category_name'].isin(["Life", "Productivity", "Entertainment"])]

df_input.shape

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_cleaned_idea_title['text'] = df_cleaned_idea_title.groupby("map_id")['idea_title'].transform(lambda x: ' [SEP] '.join(x)).drop_duplicates()


(251, 8)