## Random animal data set

In [24]:
import random

nDocs = 20

# Build a toy collection: every document is a space-separated bag of animal terms.
# NOTE: the random module is not seeded here, so each run yields a different collection.
terms = ['dog', 'cat', 'horse', 'rabit', 'ostrich', 'bear', 'tiger', 'lion', 'bird']

# Document frequency per term: a percentage that shrinks with the inverse square
# root of (position + 3), floored to at least 1 %, then scaled to nDocs.
dfs = []
for rank in range(len(terms)):
    percent = max(int(100 // (rank + 3) ** 0.5), 1)
    dfs.append(nDocs * percent // 100)

# index[k] collects the terms of the document with id k + 1
index = [[] for _ in range(nDocs)]
for term, df in zip(terms, dfs):
    # pick df distinct documents for this term and visit them in ascending order
    chosen = random.sample(range(nDocs), df)
    chosen.sort()
    for i in chosen:
        # term frequency; with bounds (1, 1) this is always 1
        tf = random.randint(1, 1)
        index[i] += [term] * tf

# document ids are 1-based, the index list is 0-based
documents = []
for doc_id in range(1, nDocs + 1):
    documents.append({'id': doc_id, 'text': ' '.join(index[doc_id - 1])})

documents

[{'id': 1, 'text': 'dog ostrich bear lion bird'},
 {'id': 2, 'text': 'dog horse rabit ostrich tiger bird'},
 {'id': 3, 'text': 'rabit lion'},
 {'id': 4, 'text': 'bear'},
 {'id': 5, 'text': 'dog cat bear lion'},
 {'id': 6, 'text': 'ostrich bear lion'},
 {'id': 7, 'text': 'dog cat ostrich bear'},
 {'id': 8, 'text': 'dog horse rabit ostrich'},
 {'id': 9, 'text': 'bear tiger lion bird'},
 {'id': 10, 'text': 'cat horse rabit bird'},
 {'id': 11, 'text': 'cat ostrich'},
 {'id': 12, 'text': 'cat'},
 {'id': 13, 'text': 'dog cat horse rabit bird'},
 {'id': 14, 'text': 'cat tiger bird'},
 {'id': 15, 'text': 'dog horse rabit'},
 {'id': 16, 'text': 'cat horse rabit ostrich'},
 {'id': 17, 'text': 'dog cat tiger'},
 {'id': 18, 'text': 'dog horse rabit'},
 {'id': 19, 'text': 'dog bear tiger'},
 {'id': 20, 'text': 'dog cat horse tiger lion'}]

## IMDB Movies data set

[Link to source](https://www.kaggle.com/datasets/harshitshankhdhar/imdb-dataset-of-top-1000-movies-and-tv-shows)

##### **Usage**
```
from datasets.docs import imdb
data = imdb.load()
```

##### **Format**
```
Document = TypedDict('Document', {'id': int, 'title': str, 'year': int, 'runtime': int, 'rating': float, 'genre': str, 'actors': str, 'summary': str})
DocumentCollection = dict[int, Document]
```
The data set contains 1000 movie titles, keyed by document id. Each `Document` has the following properties:
| name | example|
|----|----|
| id        | 1 |
| title     | The Shawshank Redemption |
| year      | 1994 |
| runtime   | 142 |
| rating    | 9.3 |
| genre     | Drama |
| actors    | Tim Robbins Morgan Freeman Bob Gunton William Sadler |
| summary   | Two imprisoned men bond over a number of years finding solace and eventual redemption through acts of common decency |


In [37]:
import pandas as pd

# Load the raw Kaggle export (one row per movie).
imdb_raw = pd.read_csv('docs/imdb_top_1000.csv')

star_columns = ['Star1', 'Star2', 'Star3', 'Star4']

# drop the unused columns and rename the remaining ones to the keys of the
# target dictionary
df = (
    imdb_raw
    .drop(columns=['Poster_Link', 'Certificate', 'Meta_score', 'Director', 'No_of_Votes', 'Gross'])
    .rename(columns={
        'Series_Title': 'title',
        'Released_Year': 'year',
        'Runtime': 'runtime',
        'Genre': 'genre',
        'IMDB_Rating': 'rating',
        'Overview': 'summary',
    })
)

# replace content
# The results are assigned back to the column instead of calling
# `df[col].replace(..., inplace=True)`: an in-place call on a column selection is
# chained assignment and leaves `df` unchanged under pandas Copy-on-Write.
# '142 min' -> '142' -> 142
df['runtime'] = df['runtime'].str.replace(r'(\d*) min', r'\1', regex=True).astype(int)
# 'Crime, Drama' -> 'Crime Drama'
df['genre'] = df['genre'].str.replace(',', '', regex=False)
# the four star columns become one space-separated string
df['actors'] = df[star_columns].apply(' '.join, axis=1)

# 1-based document id in file order; the documented format and the recorded
# output both list `id` as the first field
df['id'] = range(1, len(df) + 1)

# NOTE(review): `year` is kept exactly as parsed by read_csv. The recorded
# df.info() output shows int64 - confirm the CSV holds no non-numeric
# Released_Year values.

# reorder columns (this also discards the Star1..Star4 helper columns)
df = df[['id', 'title', 'year', 'runtime', 'rating', 'genre', 'actors', 'summary']]

# show result
df.info()
df.head(3).to_dict('records')


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 8 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   id       1000 non-null   int64  
 1   title    1000 non-null   object 
 2   year     1000 non-null   int64  
 3   runtime  1000 non-null   int32  
 4   rating   1000 non-null   float64
 5   genre    1000 non-null   object 
 6   actors   1000 non-null   object 
 7   summary  1000 non-null   object 
dtypes: float64(1), int32(1), int64(2), object(4)
memory usage: 58.7+ KB


[{'id': 1,
  'title': 'The Shawshank Redemption',
  'year': 1994,
  'runtime': 142,
  'rating': 9.3,
  'genre': 'Drama',
  'actors': 'Tim Robbins Morgan Freeman Bob Gunton William Sadler',
  'summary': 'Two imprisoned men bond over a number of years, finding solace and eventual redemption through acts of common decency.'},
 {'id': 2,
  'title': 'The Godfather',
  'year': 1972,
  'runtime': 175,
  'rating': 9.2,
  'genre': 'Crime Drama',
  'actors': 'Marlon Brando Al Pacino James Caan Diane Keaton',
  'summary': "An organized crime dynasty's aging patriarch transfers control of his clandestine empire to his reluctant son."},
 {'id': 3,
  'title': 'The Dark Knight',
  'year': 2008,
  'runtime': 152,
  'rating': 9.0,
  'genre': 'Action Crime Drama',
  'actors': 'Christian Bale Heath Ledger Aaron Eckhart Michael Caine',
  'summary': 'When the menace known as the Joker wreaks havoc and chaos on the people of Gotham, Batman must accept one of the greatest psychological and physical tests of 