In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the input tree and print the full path of every file, one per line.
for dirname, _, filenames in os.walk('/kaggle/input'):
    if filenames:
        # Directories without files print nothing at all.
        print('\n'.join(os.path.join(dirname, filename) for filename in filenames))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load the training listings and preview the free-text description column
# that the vectorizer cells below operate on.
train_df = pd.read_json(
    '/kaggle/input/two-sigma-connect-rental-listing-inquiries/train.json.zip'
)
train_df.loc[:, 'description'].head()

In [None]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from scipy import sparse

class MyCountVectorizer(CountVectorizer):
    """CountVectorizer whose ``transform`` can process its input in row batches.

    Parameters
    ----------
    dtype : numpy dtype, default=np.float64
        Assigned to ``self.dtype`` after the parent constructor has run and
        used to cast every matrix returned by ``transform``.
    batch_size : int or None, default=None
        Number of rows passed to the parent ``transform`` per call. A falsy
        value (``None`` or ``0``) disables batching.
    **kwargs
        Forwarded unchanged to ``CountVectorizer.__init__``.

    Notes
    -----
    ``fit_transform`` is not overridden in this class, so calling it uses the
    inherited implementation rather than the batched ``transform`` below.

    NOTE(review): options passed through ``**kwargs`` are not named in this
    signature; confirm whether ``get_params()``/``clone()`` need to see them.
    """

    def __init__(self, dtype=np.float64, batch_size=None, **kwargs):
        super().__init__(**kwargs)
        # Set after the parent constructor so this value is the one that sticks.
        self.dtype = dtype
        self.batch_size = batch_size

    def fit(self, X, y=None):
        """Fit with the parent implementation and return ``self``."""
        super().fit(X, y)
        return self

    def transform(self, X):
        """Transform ``X`` to a sparse matrix, optionally batch by batch.

        Parameters
        ----------
        X : sized, sliceable collection of documents
            pandas ``Series``/``DataFrame`` inputs are sliced positionally with
            ``.iloc``; anything else is sliced with ``X[start:stop]``.

        Returns
        -------
        scipy sparse matrix cast to ``self.dtype``. With batching enabled the
        per-batch results are stacked vertically in input order.
        """
        # No batching requested, or nothing to split: one parent call.
        # (sparse.vstack cannot stack an empty list of blocks.)
        if not self.batch_size or len(X) == 0:
            return super().transform(X).astype(self.dtype)

        n_samples = len(X)
        is_pandas = isinstance(X, (pd.DataFrame, pd.Series))

        # Explicit loop instead of a comprehension: zero-argument super() can
        # fail inside a comprehension on Python versions where the
        # comprehension runs in its own function scope.
        parts = []
        for start in range(0, n_samples, self.batch_size):
            stop = min(start + self.batch_size, n_samples)
            if is_pandas:
                print(f"Processsing row {start} to {stop}")
                chunk = X.iloc[start:stop]
            else:
                chunk = X[start:stop]
            parts.append(super().transform(chunk).astype(self.dtype))
        return sparse.vstack(parts)

In [None]:
# Baseline without batch_size: transform takes the unbatched branch and
# simply defers to the parent's transform. Works as expected.
xf = MyCountVectorizer().fit(train_df['description'])  # fit() returns the vectorizer itself
X0 = xf.transform(train_df['description'])
X0.shape

In [None]:
# With batch_size set, fit and transform are run as two separate steps.
# Works as expected: the overridden (batched) transform is used.
xf = MyCountVectorizer(batch_size=1000).fit(train_df['description'])  # fit() returns the vectorizer itself
X1 = xf.transform(train_df['description'])
X1.shape

In [None]:
# The batched result must equal the unbatched one element for element.
# Comparing only the grand totals could pass for two different matrices.
assert X1.shape == X0.shape, "Unmatched results"
assert (X1 != X0).nnz == 0, "Unmatched results"

In [None]:
# With batch_size set, call fit_transform in a single step.
# No batching happens here: MyCountVectorizer does not override fit_transform,
# so the inherited CountVectorizer.fit_transform runs. That method builds the
# count matrix itself and does not go through self.transform(); in scikit-learn
# it is CountVectorizer.fit that calls fit_transform, not the other way round.
# Source consulted while investigating:
# https://github.com/scikit-learn/scikit-learn/blob/15a949460/sklearn/feature_extraction/text.py#L807
# Conclusion: to get batching from fit_transform, the subclass must override it.
xf = MyCountVectorizer(batch_size=1000)
X2 = xf.fit_transform(train_df.loc[:, 'description'])
X2.shape

In [None]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from scipy import sparse

class MyCountVectorizer_1(CountVectorizer):
    """CountVectorizer with a batched ``transform`` and a matching ``fit_transform``.

    Parameters
    ----------
    dtype : numpy dtype, default=np.float64
        Assigned to ``self.dtype`` after the parent constructor has run and
        used to cast every matrix returned by ``transform``/``fit_transform``.
    batch_size : int or None, default=None
        Number of rows passed to the parent ``transform`` per call. A falsy
        value (``None`` or ``0``) disables batching.
    **kwargs
        Forwarded unchanged to ``CountVectorizer.__init__``.

    Notes
    -----
    In scikit-learn, ``CountVectorizer.fit`` is implemented by calling
    ``self.fit_transform``. An override of ``fit_transform`` therefore must
    not call ``super().fit`` (that would dispatch straight back to the
    override); it calls ``super().fit_transform`` instead.

    NOTE(review): options passed through ``**kwargs`` are not named in this
    signature; confirm whether ``get_params()``/``clone()`` need to see them.
    """

    def __init__(self, dtype=np.float64, batch_size=None, **kwargs):
        super().__init__(**kwargs)
        # Set after the parent constructor so this value is the one that sticks.
        self.dtype = dtype
        self.batch_size = batch_size

    def fit(self, X, y=None):
        """Learn the vocabulary from ``X`` and return ``self``."""
        # Call the parent's fit_transform directly: going through super().fit
        # would dispatch to this class's fit_transform and run a batched
        # transform whose result is thrown away.
        super().fit_transform(X, y)
        return self

    def transform(self, X):
        """Transform ``X`` to a sparse matrix, optionally batch by batch.

        Parameters
        ----------
        X : sized, sliceable collection of documents
            pandas ``Series``/``DataFrame`` inputs are sliced positionally with
            ``.iloc``; anything else is sliced with ``X[start:stop]``.

        Returns
        -------
        scipy sparse matrix cast to ``self.dtype``. With batching enabled the
        per-batch results are stacked vertically in input order.
        """
        # No batching requested, or nothing to split: one parent call.
        # (sparse.vstack cannot stack an empty list of blocks.)
        if not self.batch_size or len(X) == 0:
            return super().transform(X).astype(self.dtype)

        n_samples = len(X)
        is_pandas = isinstance(X, (pd.DataFrame, pd.Series))

        # Explicit loop instead of a comprehension: zero-argument super() can
        # fail inside a comprehension on Python versions where the
        # comprehension runs in its own function scope.
        parts = []
        for start in range(0, n_samples, self.batch_size):
            stop = min(start + self.batch_size, n_samples)
            if is_pandas:
                print(f"Processsing row {start} to {stop}")
                chunk = X.iloc[start:stop]
            else:
                chunk = X[start:stop]
            parts.append(super().transform(chunk).astype(self.dtype))
        return sparse.vstack(parts)

    def fit_transform(self, X, y=None):
        """Learn the vocabulary from ``X`` and return the transformed matrix.

        When ``batch_size`` is set the returned matrix is produced by the
        batched ``transform``; otherwise the parent's result is returned
        directly, cast to ``self.dtype``.
        """
        # The parent learns the vocabulary and, as a by-product, returns the
        # full unbatched count matrix.
        counts = super().fit_transform(X, y)
        if not self.batch_size:
            return counts.astype(self.dtype)
        # Batching requested: drop the by-product and redo the transform in
        # batches with the vocabulary that was just learned.
        del counts
        return self.transform(X)

In [None]:
# Exercise the subclass that overrides fit_transform.
# Expected: per-batch progress lines followed by the matrix shape.
# If this raises instead, check that MyCountVectorizer_1.fit_transform accepts
# `self`, forwards X to transform() and returns the result.
xf = MyCountVectorizer_1(batch_size=1000)
X2 = xf.fit_transform(train_df['description'])
X2.shape


In [None]:
# Same subclass, with fit and transform executed as two separate steps.
# Note: CountVectorizer.fit calls self.fit_transform internally, so a broken
# fit_transform override makes fit() fail as well.
# Rebinding xf is enough; no `del` is needed (it would raise NameError on a
# fresh kernel where xf does not exist yet).
xf = MyCountVectorizer_1(batch_size=1000)
xf.fit(train_df['description'])
X1 = xf.transform(train_df['description'])
X1.shape